Scraping actor profiles from FreeOnes.

commit e8130c3634 (parent abcdb52335)
2019-11-17 03:56:45 +01:00
268 changed files with 19161 additions and 102 deletions

View File: src/actors.js

@@ -1,6 +1,9 @@
'use strict';
const Promise = require('bluebird');
const knex = require('./knex');
const argv = require('./argv');
const scrapers = require('./scrapers/scrapers');
const whereOr = require('./utils/where-or');
async function curateActor(actor) {
@@ -13,7 +16,20 @@ async function curateActor(actor) {
description: actor.description,
birthdate: actor.birthdate && new Date(actor.birthdate),
country: actor.country_alpha2,
city: actor.city,
residencePlace: actor.residence_place,
residenceCountry: actor.residence_country_alpha2
? {
alpha2: actor.residence_country_alpha2,
name: actor.residence_country_name,
}
: null,
birthPlace: actor.birth_place,
birthCountry: actor.birth_country_alpha2
? {
alpha2: actor.birth_country_alpha2,
name: actor.birth_country_name,
}
: null,
ethnicity: actor.ethnicity,
height: actor.height,
boobSize: actor.boobs_size,
@@ -27,32 +43,113 @@ function curateActors(actors) {
return Promise.all(actors.map(async actor => curateActor(actor)));
}
function curateScrapedActor(actor) {
return {
id: actor.id,
name: actor.name,
slug: actor.name.toLowerCase().replace(/\s+/g, '-'),
birthdate: actor.birthdate,
description: actor.description,
gender: actor.gender,
ethnicity: actor.ethnicity,
birth_country_alpha2: actor.birthCountry,
residence_country_alpha2: actor.residenceCountry,
birth_place: actor.birthPlace,
residence_place: actor.residencePlace,
active: actor.active,
boobs_size: actor.boobs && actor.boobs.size,
boobs_natural: actor.boobs && actor.boobs.natural,
height: actor.height,
hair: actor.hair,
eyes: actor.eyes,
tattoos: actor.tattoos,
piercings: actor.piercings,
};
}
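The two curate functions are effectively mirror images: curateActor above folds joined snake_case rows into nested objects, while curateScrapedActor flattens a scraped camelCase profile into column names. A small sketch with hypothetical values:

const profile = { name: 'Jane Doe', birthCountry: 'US', boobs: { size: '34B', natural: true } };
// curateScrapedActor(profile) yields, among other columns:
// { name: 'Jane Doe', slug: 'jane-doe', birth_country_alpha2: 'US', boobs_size: '34B', boobs_natural: true }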
async function fetchActors(queryObject) {
const actors = await knex('actors')
.select(
'actors.*',
'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name',
'residence_countries.alpha2 as residence_country_alpha2', 'residence_countries.name as residence_country_name',
)
.leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
.leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2')
.where(builder => whereOr(queryObject, 'actors', builder))
.limit(100);
return curateActors(actors);
}
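Joining countries twice under separate aliases lets one query return both the birth and the residence country name on the same row, which curateActor then folds into the nested birthCountry and residenceCountry objects; the hard limit of 100 rows keeps unfiltered queries cheap.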
async function storeActor(actor) {
const curatedActor = curateScrapedActor(actor);
const actorEntries = await knex('actors')
.insert(curatedActor)
.returning('*');
if (actorEntries.length) {
const actorEntry = actorEntries[0];
console.log(`Added new entry for actor '${actor.name}'`);
return actorEntry;
}
console.error(`Unable to save profile for '${actor.name}'`);
return null;
}
async function updateActor(actorEntry, actor) {
const curatedActor = curateScrapedActor(actor);
const actorEntries = await knex('actors')
.where({ id: actorEntry.id })
.update(curatedActor)
.returning('*');
console.log(`Updated entry for actor '${actor.name}'`);
return actorEntries[0];
}
async function scrapeActors(actorNames) {
await Promise.map(actorNames || argv.actors, async (actorName) => {
const [actorEntry] = await fetchActors({ name: actorName });
const profiles = await Promise.all(Object.values(scrapers.actors).map(scraper => scraper.fetchActor(actorName)));
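// every registered actor scraper is queried in parallel, but only the first profile is persisted below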
if (actorEntry) {
return updateActor(actorEntry, profiles[0]);
}
return storeActor(profiles[0]);
}, {
concurrency: 5,
});
}
async function storeActors(release, releaseEntry) {
-const actors = await knex('actors').whereIn('name', release.actors);
-const newActors = release.actors.filter(actorName => !actors.some(actor => actor.name === actorName));
+const actorEntries = await knex('actors').whereIn('name', release.actors);
-const { rows: insertedActors } = newActors.length
-  ? await knex.raw(`${knex('actors').insert(newActors.map(actorName => ({
-    name: actorName,
-    slug: actorName.toLowerCase().replace(/\s+/g, '-'),
-  })))} ON CONFLICT DO NOTHING RETURNING *`)
-  : { rows: [] };
+const newActors = release.actors
+  .map(actorName => actorName.trim())
+  .filter(actorName => !actorEntries.some(actor => actor.name === actorName));
-return knex('actors_associated').insert(actors.concat(insertedActors).map(actor => ({
-  release_id: releaseEntry.id,
-  actor_id: actor.id,
-})), '*');
+const newActorEntries = await Promise.all(newActors.map(async actorName => storeActor({ name: actorName })));
+await knex('actors_associated')
+  .insert(actorEntries.concat(newActorEntries).map(actor => ({
+    release_id: releaseEntry.id,
+    actor_id: actor.id,
+  })), '*');
+scrapeActors(newActorEntries.map(actor => actor.name));
}
module.exports = {
fetchActors,
scrapeActors,
storeActors,
};
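Taken together, the module now supports both querying curated actor rows and scraping fresh profiles by name. A minimal consumer sketch, run from a scratch file next to actors.js; the performer name is hypothetical and the destroy call just lets the script exit:

'use strict';

const knex = require('./knex');
const { fetchActors, scrapeActors } = require('./actors');

async function main() {
  // fetch, curate and store (or update) the FreeOnes profile
  await scrapeActors(['Jane Doe']);

  // read the curated row back, including the joined country names
  const [actor] = await fetchActors({ name: 'Jane Doe' });
  console.log(actor);

  await knex.destroy();
}

main().catch(console.error);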

View File

@@ -6,6 +6,7 @@ const initServer = require('./web/server');
const scrapeSites = require('./scrape-sites');
const scrapeRelease = require('./scrape-release');
const { scrapeActors } = require('./actors');
async function init() {
if (argv.url) {
@@ -23,6 +24,13 @@ async function init() {
return;
}
if (argv.actors) {
await scrapeActors();
knex.destroy();
return;
}
await initServer();
}

View File: src/argv.js

@@ -19,11 +19,21 @@ const { argv } = yargs
type: 'array',
alias: 'site',
})
.option('actors', {
describe: 'Scrape actors by name or slug',
type: 'array',
alias: 'actor',
})
.option('deep', {
describe: 'Fetch details for all releases',
type: 'boolean',
default: true,
})
.option('redownload', {
describe: 'Don\'t ignore duplicates, update existing entries',
type: 'boolean',
alias: 'force',
})
.option('url', {
describe: 'Scrape scene info from URL',
type: 'string',
@@ -33,6 +43,7 @@ const { argv } = yargs
describe: 'Don\'t fetch scenes older than',
type: 'string',
default: config.fetchAfter.join(' '),
alias: 'limit',
})
.option('pages', {
describe: 'Limit pages to scrape per site. Only used when no dates are found or --after is unset.',
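With these options wired up, an invocation such as node src/app.js --actor "Jane Doe" --actor "Lacy Lane" --force (entry-point path and performer names hypothetical) ends up as argv.actors = ['Jane Doe', 'Lacy Lane'] with argv.redownload set to true, since yargs collects repeated array options and aliases write to the same key.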

View File: src/releases.js

@@ -100,52 +100,6 @@ function curateScrapedRelease(release) {
};
}
async function storeRelease(release) {
const curatedRelease = curateScrapedRelease(release);
const releaseEntries = await knex('releases')
.insert(curatedRelease)
.returning('*');
if (releaseEntries.length) {
const releaseEntry = releaseEntries[0];
console.log(`Stored (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);
await createMediaDirectory(release, releaseEntry.id);
await Promise.all([
storeActors(release, releaseEntry),
storeTags(release, releaseEntry),
storePhotos(release, releaseEntry),
storePoster(release, releaseEntry),
storeTrailer(release, releaseEntry),
]);
return releaseEntry.id;
}
console.error(`Unable to save scene to database, possible collision: "${release.title}" (${release.site.name})`);
return null;
}
async function storeReleases(releases) {
return Promise.map(releases, async (release) => {
try {
const releaseId = await storeRelease(release);
return releaseId;
} catch (error) {
console.error(error);
return null;
}
}, {
concurrency: 2,
});
}
function commonQuery(queryBuilder, {
filter = [],
after = new Date(0), // January 1970
@@ -187,6 +141,15 @@ async function fetchReleases(queryObject = {}, options = {}) {
return curateReleases(releases);
}
async function fetchReleasesByEntryIds(entryIds, queryObject = {}, options = {}) {
const releases = await knex('releases')
.modify(commonQuery, options)
.whereIn('entry_id', entryIds)
.andWhere(builder => whereOr(queryObject, 'releases', builder));
return curateReleases(releases);
}
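fetchReleasesByEntryIds lays the groundwork for duplicate handling: callers can map freshly scraped releases to their entryId values and learn in a single whereIn query which of them already exist, as storeReleases below starts to do.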
async function fetchSiteReleases(queryObject, options = {}) {
const releases = await knex('releases')
.modify(commonQuery, options)
@@ -229,6 +192,56 @@ async function fetchTagReleases(queryObject, options = {}) {
return curateReleases(releases);
}
async function storeRelease(release) {
const curatedRelease = curateScrapedRelease(release);
const releaseEntries = await knex('releases')
.insert(curatedRelease)
.returning('*');
if (releaseEntries.length) {
const releaseEntry = releaseEntries[0];
console.log(`Stored (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);
await createMediaDirectory(release, releaseEntry.id);
await Promise.all([
storeActors(release, releaseEntry),
storeTags(release, releaseEntry),
storePhotos(release, releaseEntry),
storePoster(release, releaseEntry),
storeTrailer(release, releaseEntry),
]);
return releaseEntry.id;
}
console.error(`Unable to save scene to database, possible collision: "${release.title}" (${release.site.name})`);
return null;
}
async function storeReleases(releases) {
const existingReleases = await fetchReleasesByEntryIds(releases.map(release => release.entryId));
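// debug output for now: these entries already exist and could be skipped or updated (cf. --redownload)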
console.log(existingReleases);
return Promise.map(releases, async (release) => {
try {
const releaseId = await storeRelease(release);
return releaseId;
} catch (error) {
console.error(error);
return null;
}
}, {
concurrency: 2,
});
}
module.exports = {
fetchReleases,
fetchActorReleases,

View File: src/scrape-release.js

@@ -4,7 +4,7 @@ const config = require('config');
const argv = require('./argv');
const scrapers = require('./scrapers/scrapers');
-const { storeRelease } = require('./releases');
+const { storeReleases } = require('./releases');
const { findSiteByUrl } = require('./sites');
const { findNetworkByUrl } = require('./networks');
@@ -30,7 +30,7 @@ async function findSite(url, release) {
async function scrapeRelease(url, release, deep = false) {
const site = await findSite(url, release);
-const scraper = scrapers[site.slug] || scrapers[site.network.slug];
+const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
if (!site) {
throw new Error('Could not find site in database');
@@ -48,7 +48,7 @@ async function scrapeRelease(url, release, deep = false) {
if (!deep && argv.save) {
// don't store release when called by site scraper
-const releaseId = await storeRelease(scene);
+const releaseId = await storeReleases([scene]);
console.log(`http://${config.web.host}:${config.web.port}/scene/${releaseId}`);
}
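One caveat: storeReleases resolves to an array of ids (it wraps Promise.map), so releaseId above is technically an array; interpolating a one-element array still prints the bare id, which keeps the logged scene URL intact.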

View File: src/scrape-sites.js

@@ -36,7 +36,7 @@ async function scrapeUniqueReleases(scraper, site, afterDate = getAfterDate(), a
return [];
}
-const duplicateReleaseIds = await findDuplicateReleaseIds(latestReleases, accReleases);
+const duplicateReleaseIds = argv.redownload ? new Set() : await findDuplicateReleaseIds(latestReleases, accReleases);
const uniqueReleases = latestReleases
.filter(release => !duplicateReleaseIds.has(String(release.entryId)) // release is already in database
@@ -107,7 +107,7 @@ async function scrapeReleases() {
console.log(`Found ${sites.length} sites in database`);
await Promise.map(sites, async (site) => {
-const scraper = scrapers[site.slug] || scrapers[site.network.slug];
+const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
if (!scraper) {
console.warn(`No scraper found for '${site.name}' (${site.slug})`);

View File: src/scrapers/freeones.js (new file, 136 lines)

@@ -0,0 +1,136 @@
'use strict';
/* eslint-disable newline-per-chained-call */
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const knex = require('../knex');
async function scrapeActorFrontpage(html, url, name) {
const { document } = new JSDOM(html).window;
const bioEl = document.querySelector('.dashboard-bio-list');
const bioUrl = `https:${document.querySelector('.seemore a').href}`;
const keys = Array.from(bioEl.querySelectorAll('dt'), el => el.textContent.trim());
const values = Array.from(bioEl.querySelectorAll('dd'), el => el.textContent.trim());
const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {});
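// 'bio' is now a label/value map, e.g. (values hypothetical):
// { 'Date of Birth:': 'April 1, 1990 (29 years old)', 'Measurements:': '34B-24-34', 'Fake Boobs:': 'No' }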
const birthdateString = bio['Date of Birth:'];
const birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate();
const boobsSizeString = bio['Measurements:'];
const boobsSize = boobsSizeString === '??-??-??' ? null : boobsSizeString;
const boobsNatural = bio['Fake Boobs:'] === 'No';
const active = bio['Career Status:'].trim() === 'Active';
const residenceCountryName = bio['Country of Origin:'];
const countryEntry = await knex('countries').where({ name: residenceCountryName }).first();
const residenceCountry = countryEntry ? countryEntry.alpha2 : null;
const birthPlace = bio['Place of Birth:'];
const hair = bio['Hair Color:'].toLowerCase();
const eyes = bio['Eye Color:'].toLowerCase();
const piercingsString = bio['Piercings:'];
const piercings = piercingsString === 'None' ? null : piercingsString;
const tattoosString = bio['Tattoos:'];
const tattoos = tattoosString === 'Unknown (add)' || tattoosString === 'None' ? null : tattoosString;
const social = Array.from(bioEl.querySelectorAll('.dashboard-socialmedia a'), el => el.href);
return {
bio: {
name,
gender: 'female',
birthdate,
residenceCountry,
birthPlace,
boobs: {
size: boobsSize,
natural: boobsNatural,
},
hair,
eyes,
piercings,
tattoos,
active,
social,
},
url: bioUrl,
};
}
async function scrapeActorBio(html, frontpageBio, url, name) {
const { document } = new JSDOM(html).window;
const bioEl = document.querySelector('#biographyTable');
const keys = Array.from(bioEl.querySelectorAll('td:nth-child(1)'), el => el.textContent.trim());
const values = Array.from(bioEl.querySelectorAll('td:nth-child(2)'), el => el.textContent.trim());
const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {});
const birthdateString = bio['Date of Birth:'];
const birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate();
const active = bio['Career Status:'].trim() === 'Active';
const boobsSizeString = bio['Measurements:'];
const boobsSize = boobsSizeString === '??-??-??' ? null : boobsSizeString;
const boobsNatural = bio['Fake boobs:'] === 'No';
const ethnicity = bio['Ethnicity:'];
const residenceCountryName = bio['Country of Origin:'];
const countryEntry = await knex('countries').where({ name: residenceCountryName }).first();
const residenceCountry = countryEntry ? countryEntry.alpha2 : null;
const birthPlace = bio['Place of Birth:'];
const hair = bio['Hair Color:'].toLowerCase();
const eyes = bio['Eye Color:'].toLowerCase();
const piercingsString = bio['Piercings:'];
const piercings = piercingsString === 'None' ? null : piercingsString;
const tattoosString = bio['Tattoos:'];
const tattoos = tattoosString === undefined || tattoosString === 'Unknown (add)' || tattoosString === 'None' ? null : tattoosString;
const social = Array.from(bioEl.querySelectorAll('#socialmedia a'), el => el.href);
return {
...frontpageBio,
name,
gender: 'female',
birthdate,
residenceCountry,
birthPlace,
ethnicity,
boobs: {
size: boobsSize,
natural: boobsNatural,
},
hair,
eyes,
piercings,
tattoos,
active,
social,
};
}
async function fetchActor(actorName) {
const slug = actorName.replace(/\s+/g, '_'); // FreeOnes legacy URLs separate all name parts with underscores
const frontpageUrl = `https://freeones.com/html/v_links/${slug}`;
const resFrontpage = await bhttp.get(frontpageUrl);
const { url, bio } = await scrapeActorFrontpage(resFrontpage.body.toString(), frontpageUrl, actorName);
const resBio = await bhttp.get(url);
return scrapeActorBio(resBio.body.toString(), bio, url, actorName);
}
module.exports = {
fetchActor,
};
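A quick way to exercise the scraper in isolation, from a scratch file in the repository root; the performer name is hypothetical, and the paths follow the requires above (the module lives at src/scrapers/freeones.js and pulls the shared connection from src/knex.js, which must reach the database for the countries lookup):

'use strict';

const knex = require('./src/knex');
const { fetchActor } = require('./src/scrapers/freeones');

fetchActor('Jane Doe')
  .then((profile) => console.log(profile))
  .catch((error) => console.error(error))
  .finally(() => knex.destroy());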

View File: src/scrapers/scrapers.js

@@ -1,5 +1,6 @@
'use strict';
// releases
const twentyonesextury = require('./21sextury');
const bangbros = require('./bangbros');
const blowpass = require('./blowpass');
@@ -19,24 +20,32 @@ const realitykings = require('./realitykings');
const vixen = require('./vixen');
const xempire = require('./xempire');
// actors
const freeones = require('./freeones');
module.exports = {
-  '21sextury': twentyonesextury,
-  bangbros,
-  blowpass,
-  brazzers,
-  ddfnetwork,
-  dogfart,
-  dogfartnetwork: dogfart,
-  evilangel,
-  julesjordan,
-  kink,
-  legalporno,
-  mikeadriano,
-  mofos,
-  pervcity,
-  private: privateNetwork,
-  naughtyamerica,
-  realitykings,
-  vixen,
-  xempire,
+  releases: {
+    '21sextury': twentyonesextury,
+    bangbros,
+    blowpass,
+    brazzers,
+    ddfnetwork,
+    dogfart,
+    dogfartnetwork: dogfart,
+    evilangel,
+    julesjordan,
+    kink,
+    legalporno,
+    mikeadriano,
+    mofos,
+    pervcity,
+    private: privateNetwork,
+    naughtyamerica,
+    realitykings,
+    vixen,
+    xempire,
+  },
+  actors: {
+    freeones,
+  },
};
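Consumers now pick scrapers by type: scrape-release and scrape-sites (updated above) resolve release scrapers via scrapers.releases[site.slug], while actors.js iterates Object.values(scrapers.actors) to collect profiles, so new actor sources only need an entry in the actors map.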