Scraping actor profiles from FreeOnes.
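
For context, a minimal usage sketch of the new entry point (performer name and require path hypothetical): scrapeActors takes an array of names and falls back to the --actors CLI argument added in this commit.

const { scrapeActors } = require('./src/actors');

// 'Jane Doe' is a hypothetical name; calling scrapeActors() with no argument
// falls back to the --actors CLI flag instead.
scrapeActors(['Jane Doe']).then(() => console.log('Actor profiles scraped'));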
src/actors.js | 123
@@ -1,6 +1,9 @@
 'use strict';

+const Promise = require('bluebird');
 const knex = require('./knex');
+const argv = require('./argv');
+const scrapers = require('./scrapers/scrapers');
 const whereOr = require('./utils/where-or');

 async function curateActor(actor) {
@@ -13,7 +16,20 @@ async function curateActor(actor) {
     description: actor.description,
     birthdate: actor.birthdate && new Date(actor.birthdate),
-    country: actor.country_alpha2,
     city: actor.city,
+    residencePlace: actor.residence_place,
+    residenceCountry: actor.residence_country_alpha2
+      ? {
+        alpha2: actor.residence_country_alpha2,
+        name: actor.residence_country_name,
+      }
+      : null,
+    birthPlace: actor.birth_place,
+    birthCountry: actor.birth_country_alpha2
+      ? {
+        alpha2: actor.birth_country_alpha2,
+        name: actor.birth_country_name,
+      }
+      : null,
     ethnicity: actor.ethnicity,
     height: actor.height,
     boobSize: actor.boobs_size,
@@ -27,32 +43,113 @@ function curateActors(releases) {
   return Promise.all(releases.map(async release => curateActor(release)));
 }

+function curateScrapedActor(actor) {
+  return {
+    id: actor.id,
+    name: actor.name,
+    slug: actor.name.toLowerCase().replace(/\s+/g, '-'),
+    birthdate: actor.birthdate,
+    description: actor.description,
+    gender: actor.gender,
+    ethnicity: actor.ethnicity,
+    birth_country_alpha2: actor.birthCountry,
+    residence_country_alpha2: actor.residenceCountry,
+    birth_place: actor.birthPlace,
+    residence_place: actor.residencePlace,
+    active: actor.active,
+    boobs_size: actor.boobs && actor.boobs.size,
+    boobs_natural: actor.boobs && actor.boobs.natural,
+    height: actor.height,
+    hair: actor.hair,
+    eyes: actor.eyes,
+    tattoos: actor.tattoos,
+    piercings: actor.piercings,
+  };
+}

 async function fetchActors(queryObject) {
   const releases = await knex('actors')
     .select(
       'actors.*',
       'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name',
       'residence_countries.alpha2 as residence_country_alpha2', 'residence_countries.name as residence_country_name',
     )
     .leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
     .leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2')
     .where(builder => whereOr(queryObject, 'actors', builder))
     .limit(100);

   return curateActors(releases);
 }

+async function storeActor(actor) {
+  const curatedActor = curateScrapedActor(actor);
+
+  const actorEntries = await knex('actors')
+    .insert(curatedActor)
+    .returning('*');
+
+  if (actorEntries.length) {
+    const actorEntry = actorEntries[0];
+
+    console.log(`Added new entry for actor '${actor.name}'`);
+
+    return actorEntry;
+  }
+
+  console.error(`Unable to save profile for '${actor.name}'`);
+
+  return null;
+}
+
+async function updateActor(actorEntry, actor) {
+  const curatedActor = curateScrapedActor(actor);
+
+  const actorEntries = await knex('actors')
+    .where({ id: actorEntry.id })
+    .update(curatedActor)
+    .returning('*');
+
+  console.log(`Updated entry for actor '${actor.name}'`);
+
+  return actorEntries[0];
+}
+
+async function scrapeActors(actorNames) {
+  await Promise.map(actorNames || argv.actors, async (actorName) => {
+    const [actorEntry] = await fetchActors({ name: actorName });
+    const profiles = await Promise.all(Object.values(scrapers.actors).map(scraper => scraper.fetchActor(actorName)));
+
+    if (actorEntry) {
+      return updateActor(actorEntry, profiles[0]);
+    }
+
+    return storeActor(profiles[0]);
+  }, {
+    concurrency: 5,
+  });
+}

 async function storeActors(release, releaseEntry) {
-  const actors = await knex('actors').whereIn('name', release.actors);
-  const newActors = release.actors.filter(actorName => !actors.some(actor => actor.name === actorName));
-
-  const { rows: insertedActors } = newActors.length
-    ? await knex.raw(`${knex('actors').insert(newActors.map(actorName => ({
-      name: actorName,
-      slug: actorName.toLowerCase().replace(/\s+/g, '-'),
-    })))} ON CONFLICT DO NOTHING RETURNING *`)
-    : { rows: [] };
-
-  return knex('actors_associated').insert(actors.concat(insertedActors).map(actor => ({
-    release_id: releaseEntry.id,
-    actor_id: actor.id,
-  })), '*');
+  const actorEntries = await knex('actors').whereIn('name', release.actors);
+  const newActors = release.actors
+    .map(actorName => actorName.trim())
+    .filter(actorName => !actorEntries.some(actor => actor.name === actorName));
+
+  const newActorEntries = await Promise.all(newActors.map(async actorName => storeActor({ name: actorName })));
+
+  await knex('actors_associated')
+    .insert(actorEntries.concat(newActorEntries).map(actor => ({
+      release_id: releaseEntry.id,
+      actor_id: actor.id,
+    })), '*');
+
+  scrapeActors(newActorEntries.map(actor => actor.name));
 }

+module.exports = {
+  fetchActors,
+  scrapeActors,
+  storeActors,
+};
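A hedged sketch of the new storeActors flow (object shapes assumed from the code above, names hypothetical): existing rows are matched by name, unknown names are inserted via storeActor, both sets are associated with the release, and profile scraping is kicked off without being awaited.

// Inside an async context; release and releaseEntry shapes are assumed.
const { storeActors } = require('./src/actors');

await storeActors(
  { actors: ['Jane Doe', 'John Roe'] }, // hypothetical scraped release
  { id: 1 },                            // hypothetical stored release row
);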
@@ -6,6 +6,7 @@ const initServer = require('./web/server');

 const scrapeSites = require('./scrape-sites');
 const scrapeRelease = require('./scrape-release');
+const { scrapeActors } = require('./actors');

 async function init() {
   if (argv.url) {
@@ -23,6 +24,13 @@ async function init() {
     return;
   }

+  if (argv.actors) {
+    await scrapeActors();
+    knex.destroy();
+
+    return;
+  }
+
   await initServer();
 }
src/argv.js | 11
@@ -19,11 +19,21 @@ const { argv } = yargs
     type: 'array',
     alias: 'site',
   })
+  .option('actors', {
+    describe: 'Scrape actors by name or slug',
+    type: 'array',
+    alias: 'actor',
+  })
   .option('deep', {
     describe: 'Fetch details for all releases',
     type: 'boolean',
     default: true,
   })
+  .option('redownload', {
+    describe: 'Don\'t ignore duplicates, update existing entries',
+    type: 'boolean',
+    alias: 'force',
+  })
   .option('url', {
     describe: 'Scrape scene info from URL',
     type: 'string',
@@ -33,6 +43,7 @@ const { argv } = yargs
     describe: 'Don\'t fetch scenes older than',
     type: 'string',
     default: config.fetchAfter.join(' '),
+    alias: 'limit',
   })
   .option('pages', {
     describe: 'Limit pages to scrape per site. Only used when no dates are found or --after is unset.',
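A hedged example of how the new options parse (entry-point filename assumed): running node src/app.js --actor 'Jane Doe' --force yields:

argv.actors;     // ['Jane Doe'], via the 'actor' alias
argv.redownload; // true, via the 'force' alias
argv.after;      // also reachable as argv.limit through the new alias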
src/releases.js | 105
@@ -100,52 +100,6 @@ function curateScrapedRelease(release) {
   };
 }

-async function storeRelease(release) {
-  const curatedRelease = curateScrapedRelease(release);
-
-  const releaseEntries = await knex('releases')
-    .insert(curatedRelease)
-    .returning('*');
-
-  if (releaseEntries.length) {
-    const releaseEntry = releaseEntries[0];
-
-    console.log(`Stored (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);
-
-    await createMediaDirectory(release, releaseEntry.id);
-
-    await Promise.all([
-      storeActors(release, releaseEntry),
-      storeTags(release, releaseEntry),
-      storePhotos(release, releaseEntry),
-      storePoster(release, releaseEntry),
-      storeTrailer(release, releaseEntry),
-    ]);
-
-    return releaseEntry.id;
-  }
-
-  console.error(`Unable to save scene to database, possible collision: "${release.title}" (${release.site.name})`);
-
-  return null;
-}
-
-async function storeReleases(releases) {
-  return Promise.map(releases, async (release) => {
-    try {
-      const releaseId = await storeRelease(release);
-
-      return releaseId;
-    } catch (error) {
-      console.error(error);
-
-      return null;
-    }
-  }, {
-    concurrency: 2,
-  });
-}
-
 function commonQuery(queryBuilder, {
   filter = [],
   after = new Date(0), // January 1970
@@ -187,6 +141,15 @@ async function fetchReleases(queryObject = {}, options = {}) {
   return curateReleases(releases);
 }

+async function fetchReleasesByEntryIds(entryIds, queryObject = {}, options = {}) {
+  const releases = await knex('releases')
+    .modify(commonQuery, options)
+    .whereIn('entry_id', entryIds)
+    .andWhere(builder => whereOr(queryObject, 'releases', builder));
+
+  return curateReleases(releases);
+}
+
 async function fetchSiteReleases(queryObject, options = {}) {
   const releases = await knex('releases')
     .modify(commonQuery, options)
@@ -229,6 +192,56 @@ async function fetchTagReleases(queryObject, options = {}) {
   return curateReleases(releases);
 }

+async function storeRelease(release) {
+  const curatedRelease = curateScrapedRelease(release);
+
+  const releaseEntries = await knex('releases')
+    .insert(curatedRelease)
+    .returning('*');
+
+  if (releaseEntries.length) {
+    const releaseEntry = releaseEntries[0];
+
+    console.log(`Stored (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);
+
+    await createMediaDirectory(release, releaseEntry.id);
+
+    await Promise.all([
+      storeActors(release, releaseEntry),
+      storeTags(release, releaseEntry),
+      storePhotos(release, releaseEntry),
+      storePoster(release, releaseEntry),
+      storeTrailer(release, releaseEntry),
+    ]);
+
+    return releaseEntry.id;
+  }
+
+  console.error(`Unable to save scene to database, possible collision: "${release.title}" (${release.site.name})`);
+
+  return null;
+}
+
+async function storeReleases(releases) {
+  const existingReleases = await fetchReleasesByEntryIds(releases.map(release => release.entryId));
+
+  console.log(existingReleases);
+
+  return Promise.map(releases, async (release) => {
+    try {
+      const releaseId = await storeRelease(release);
+
+      return releaseId;
+    } catch (error) {
+      console.error(error);
+
+      return null;
+    }
+  }, {
+    concurrency: 2,
+  });
+}
+
 module.exports = {
   fetchReleases,
   fetchActorReleases,
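A hedged sketch of the duplicate-detection idea behind the new storeReleases (entry IDs hypothetical; assumes the curated rows expose entryId):

// Inside releases.js, in an async context; fetchReleasesByEntryIds is not (yet) exported.
const existing = await fetchReleasesByEntryIds(['12345', '67890']); // hypothetical scraper entry IDs
const existingIds = new Set(existing.map(release => release.entryId));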
@@ -4,7 +4,7 @@ const config = require('config');
 const argv = require('./argv');
 const scrapers = require('./scrapers/scrapers');
-const { storeRelease } = require('./releases');
+const { storeReleases } = require('./releases');
 const { findSiteByUrl } = require('./sites');
 const { findNetworkByUrl } = require('./networks');
@@ -30,7 +30,7 @@ async function findSite(url, release) {
 async function scrapeRelease(url, release, deep = false) {
   const site = await findSite(url, release);
-  const scraper = scrapers[site.slug] || scrapers[site.network.slug];
+  const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];

   if (!site) {
     throw new Error('Could not find site in database');
@@ -48,7 +48,7 @@ async function scrapeRelease(url, release, deep = false) {
   if (!deep && argv.save) {
     // don't store release when called by site scraper
-    const releaseId = await storeRelease(scene);
+    const releaseId = await storeReleases([scene]);

     console.log(`http://${config.web.host}:${config.web.port}/scene/${releaseId}`);
   }
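A hedged usage sketch of the updated single-scene path (URL hypothetical; the module's export shape is assumed): with --save set, the scraped scene now goes through the batch storeReleases helper.

// In an async context.
const scrapeRelease = require('./src/scrape-release'); // export shape assumed

await scrapeRelease('https://www.example.com/scene/12345', null, false);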
@@ -36,7 +36,7 @@ async function scrapeUniqueReleases(scraper, site, afterDate = getAfterDate(), a
     return [];
   }

-  const duplicateReleaseIds = await findDuplicateReleaseIds(latestReleases, accReleases);
+  const duplicateReleaseIds = argv.redownload ? new Set() : await findDuplicateReleaseIds(latestReleases, accReleases);

   const uniqueReleases = latestReleases
     .filter(release => !duplicateReleaseIds.has(String(release.entryId)) // release is already in database
@@ -107,7 +107,7 @@ async function scrapeReleases() {
   console.log(`Found ${sites.length} sites in database`);

   await Promise.map(sites, async (site) => {
-    const scraper = scrapers[site.slug] || scrapers[site.network.slug];
+    const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];

     if (!scraper) {
       console.warn(`No scraper found for '${site.name}' (${site.slug})`);
src/scrapers/freeones.js | 136 (new file)
@@ -0,0 +1,136 @@
'use strict';

/* eslint-disable newline-per-chained-call */
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');

const knex = require('../knex');

async function scrapeActorFrontpage(html, url, name) {
  const { document } = new JSDOM(html).window;
  const bioEl = document.querySelector('.dashboard-bio-list');

  const bioUrl = `https:${document.querySelector('.seemore a').href}`;

  const keys = Array.from(bioEl.querySelectorAll('dt'), el => el.textContent.trim());
  const values = Array.from(bioEl.querySelectorAll('dd'), el => el.textContent.trim());

  const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {});

  const birthdateString = bio['Date of Birth:'];
  const birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate();

  const boobsSizeString = bio['Measurements:'];
  const boobsSize = boobsSizeString === '??-??-??' ? null : boobsSizeString;
  const boobsNatural = bio['Fake Boobs:'] === 'No';
  const active = bio['Career Status:'].trim() === 'Active';

  const residenceCountryName = bio['Country of Origin:'];
  const countryEntry = await knex('countries').where({ name: residenceCountryName }).first();
  const residenceCountry = countryEntry ? countryEntry.alpha2 : null;
  const birthPlace = bio['Place of Birth:'];

  const hair = bio['Hair Color:'].toLowerCase();
  const eyes = bio['Eye Color:'].toLowerCase();

  const piercingsString = bio['Piercings:'];
  const piercings = piercingsString === 'None' ? null : piercingsString;

  const tattoosString = bio['Tattoos:'];
  const tattoos = tattoosString === 'Unknown (add)' || tattoosString === 'None' ? null : tattoosString;

  const social = Array.from(bioEl.querySelectorAll('.dashboard-socialmedia a'), el => el.href);

  return {
    bio: {
      name,
      gender: 'female',
      birthdate,
      residenceCountry,
      birthPlace,
      boobs: {
        size: boobsSize,
        natural: boobsNatural,
      },
      hair,
      eyes,
      piercings,
      tattoos,
      active,
      social,
    },
    url: bioUrl,
  };
}

async function scrapeActorBio(html, frontpageBio, url, name) {
  const { document } = new JSDOM(html).window;
  const bioEl = document.querySelector('#biographyTable');

  const keys = Array.from(bioEl.querySelectorAll('td:nth-child(1)'), el => el.textContent.trim());
  const values = Array.from(bioEl.querySelectorAll('td:nth-child(2)'), el => el.textContent.trim());

  const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {});

  const birthdateString = bio['Date of Birth:'];
  const birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate();
  const active = bio['Career Status:'].trim() === 'Active';

  const boobsSizeString = bio['Measurements:'];
  const boobsSize = boobsSizeString === '??-??-??' ? null : boobsSizeString;
  const boobsNatural = bio['Fake boobs:'] === 'No';
  const ethnicity = bio['Ethnicity:'];

  const residenceCountryName = bio['Country of Origin:'];
  const countryEntry = await knex('countries').where({ name: residenceCountryName }).first();
  const residenceCountry = countryEntry ? countryEntry.alpha2 : null;
  const birthPlace = bio['Place of Birth:'];

  const hair = bio['Hair Color:'].toLowerCase();
  const eyes = bio['Eye Color:'].toLowerCase();

  const piercingsString = bio['Piercings:'];
  const piercings = piercingsString === 'None' ? null : piercingsString;

  const tattoosString = bio['Tattoos:'];
  const tattoos = tattoosString === undefined || tattoosString === 'Unknown (add)' || tattoosString === 'None' ? null : tattoosString;

  const social = Array.from(bioEl.querySelectorAll('#socialmedia a'), el => el.href);

  return {
    ...frontpageBio,
    name,
    gender: 'female',
    birthdate,
    residenceCountry,
    birthPlace,
    ethnicity,
    boobs: {
      size: boobsSize,
      natural: boobsNatural,
    },
    hair,
    eyes,
    piercings,
    tattoos,
    active,
    social,
  };
}

async function fetchActor(actorName) {
  const slug = actorName.replace(/\s+/g, '_'); // replace every space, not just the first
  const frontpageUrl = `https://freeones.com/html/v_links/${slug}`;

  const resFrontpage = await bhttp.get(frontpageUrl);
  const { url, bio } = await scrapeActorFrontpage(resFrontpage.body.toString(), frontpageUrl, actorName);

  const resBio = await bhttp.get(url);

  return scrapeActorBio(resBio.body.toString(), bio, url, actorName);
}

module.exports = {
  fetchActor,
};
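A minimal usage sketch of the new scraper (performer name hypothetical): fetchActor fetches the FreeOnes frontpage, follows the "see more" link to the full biography, and resolves with the merged bio object built above.

const { fetchActor } = require('./src/scrapers/freeones');

fetchActor('Jane Doe').then((bio) => {
  console.log(bio.birthdate, bio.hair, bio.social);
});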
@@ -1,5 +1,6 @@
 'use strict';

+// releases
 const twentyonesextury = require('./21sextury');
 const bangbros = require('./bangbros');
 const blowpass = require('./blowpass');
@@ -19,24 +20,32 @@ const realitykings = require('./realitykings');
 const vixen = require('./vixen');
 const xempire = require('./xempire');

+// actors
+const freeones = require('./freeones');
+
 module.exports = {
-  '21sextury': twentyonesextury,
-  bangbros,
-  blowpass,
-  brazzers,
-  ddfnetwork,
-  dogfart,
-  dogfartnetwork: dogfart,
-  evilangel,
-  julesjordan,
-  kink,
-  legalporno,
-  mikeadriano,
-  mofos,
-  pervcity,
-  private: privateNetwork,
-  naughtyamerica,
-  realitykings,
-  vixen,
-  xempire,
+  releases: {
+    '21sextury': twentyonesextury,
+    bangbros,
+    blowpass,
+    brazzers,
+    ddfnetwork,
+    dogfart,
+    dogfartnetwork: dogfart,
+    evilangel,
+    julesjordan,
+    kink,
+    legalporno,
+    mikeadriano,
+    mofos,
+    pervcity,
+    private: privateNetwork,
+    naughtyamerica,
+    realitykings,
+    vixen,
+    xempire,
+  },
+  actors: {
+    freeones,
+  },
 };
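A sketch of the new namespaced lookups, mirroring the call sites in the release and actor scraping code above (site object hypothetical):

const scrapers = require('./src/scrapers/scrapers');

// Release scrapers resolve per site, falling back to the network.
const site = { slug: 'example-site', network: { slug: 'ddfnetwork' } }; // hypothetical
const releaseScraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];

// Actor profile scrapers live in their own namespace.
const profilePromises = Object.values(scrapers.actors).map(scraper => scraper.fetchActor('Jane Doe'));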