Refactoring deep scrape. Added tag posters.

ThePendulum 2020-03-16 04:10:52 +01:00
parent c8ebe7892a
commit 0f09fd53eb
31 changed files with 851 additions and 589 deletions

BIN public/img/tags/anal/1.jpeg (normal file, added, 5.0 MiB, binary not shown)
BIN (filename not captured) (added, 96 KiB, binary not shown)
BIN (filename not captured) (added, 1.0 MiB, binary not shown)
BIN (filename not captured) (added, 102 KiB, binary not shown)
BIN public/img/tags/mff/0.jpeg (executable file, added, 1.3 MiB, binary not shown)
BIN public/img/tags/mff/0_thumb.jpeg (executable file, added, 94 KiB, binary not shown)
BIN (filename not captured) (removed, 101 KiB, binary not shown)
BIN (filename not captured) (removed, 17 KiB, binary not shown)

View File

@@ -6,6 +6,7 @@ const tagPosters = [
   ['bdsm', 0, 'Dani Daniels in "The Training of Dani Daniels, Day 2" for The Training of O at Kink'],
   ['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'],
   ['da-tp', 0, 'Natasha Teen in LegalPorno SZ2164'],
+  ['deepthroat', 0, 'Chanel Grey in "Deepthroating Is Fun" for Throated'],
   ['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'],
   ['double-penetration', 'poster', 'Mia Malkova in "DP!" for HardX'],
   ['double-vaginal', 'poster', 'Riley Reid in "Pizza That Ass" for Reid My Lips'],
@@ -13,7 +14,7 @@ const tagPosters = [
   ['oral-creampie', 1, 'Keisha Grey in Brazzers House'],
   ['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'],
   ['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'],
-  ['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'],
+  ['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
   ['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'],
   ['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
   ['blowbang', 'poster'],
@@ -27,7 +28,7 @@ const tagPosters = [
   ['gaping', 1, 'Vina Sky in "Vina Sky Does Anal" for HardX'],
   ['interracial', 'poster'],
   ['latina', 'poster'],
-  ['mff', 'poster'],
+  ['mff', 0, 'Madison Ivy and Adriana Chechik in "Day With A Pornstar" for Brazzers'],
   ['mfm', 'poster'],
   ['orgy', 'poster'],
   ['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
@@ -47,6 +48,7 @@ const tagPosters = [
 const tagPhotos = [
   ['airtight', 2, 'Dakota Skye in "Dakota Goes Nuts" for ArchAngel'],
   ['airtight', 3, 'Anita Bellini in "Triple Dick Gangbang" for Hands On Hardcore (DDF Network)'],
+  ['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'],
   ['anal', 0],
   ['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'],
   ['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'],
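Each tuple pairs a tag slug with a media index (or the string 'poster') and an optional credit line. A hypothetical helper, not part of this commit, showing how a tuple lines up with the image files added above, assuming the public/img/tags/<slug>/<index>.jpeg layout:

// hypothetical, for illustration only: resolve a tag poster tuple to its files
function tagPosterPaths([slug, index]) {
  const base = `public/img/tags/${slug}/${index}`; // index may be a number or 'poster'
  return [`${base}.jpeg`, `${base}_thumb.jpeg`]; // e.g. mff/0.jpeg and mff/0_thumb.jpeg for ['mff', 0, ...]
}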

src/actors-legacy.js Normal file (539 lines)
View File

@@ -0,0 +1,539 @@
'use strict';
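// Legacy copy of the previous src/actors.js, carried over unchanged while the
// deep-scrape refactor rebuilds the actor pipeline (see src/actors.js below).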
const config = require('config');
const Promise = require('bluebird');
const UrlPattern = require('url-pattern');
const moment = require('moment');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const scrapers = require('./scrapers/scrapers');
const whereOr = require('./utils/where-or');
const resolvePlace = require('./utils/resolve-place');
const slugify = require('./utils/slugify');
const capitalize = require('./utils/capitalize');
const { curateSites } = require('./sites');
const { storeMedia, associateMedia } = require('./media');
async function curateActor(actor) {
const [aliases, avatar, photos, social] = await Promise.all([
knex('actors').where({ alias_for: actor.id }),
knex('actors_avatars')
.where('actor_id', actor.id)
.join('media', 'media.id', 'actors_avatars.media_id')
.first(),
knex('actors_photos')
.where('actor_id', actor.id)
.join('media', 'media.id', 'actors_photos.media_id')
.orderBy('index'),
knex('actors_social')
.where('actor_id', actor.id)
.orderBy('platform', 'desc'),
]);
const curatedActor = {
id: actor.id,
gender: actor.gender,
name: actor.name,
description: actor.description,
birthdate: actor.birthdate && new Date(actor.birthdate),
country: actor.country_alpha2,
origin: (actor.birth_city || actor.birth_state || actor.birth_country_alpha2) ? {} : null,
residence: (actor.residence_city || actor.residence_state || actor.residence_country_alpha2) ? {} : null,
ethnicity: actor.ethnicity,
height: actor.height,
weight: actor.weight,
bust: actor.bust,
waist: actor.waist,
hip: actor.hip,
naturalBoobs: actor.natural_boobs,
aliases: aliases.map(({ name }) => name),
slug: actor.slug,
avatar,
photos,
hasTattoos: actor.has_tattoos,
hasPiercings: actor.has_piercings,
tattoos: actor.tattoos,
piercings: actor.piercings,
social,
scrapedAt: actor.scraped_at,
};
if (curatedActor.birthdate) {
curatedActor.age = moment().diff(curatedActor.birthdate, 'years');
}
if (actor.birth_city) curatedActor.origin.city = actor.birth_city;
if (actor.birth_state) curatedActor.origin.state = actor.birth_state;
if (actor.birth_country_alpha2) {
curatedActor.origin.country = {
alpha2: actor.birth_country_alpha2,
name: actor.birth_country_name,
alias: actor.birth_country_alias,
};
}
if (actor.residence_city) curatedActor.residence.city = actor.residence_city;
if (actor.residence_state) curatedActor.residence.state = actor.residence_state;
if (actor.residence_country_alpha2) {
curatedActor.residence.country = {
alpha2: actor.residence_country_alpha2,
name: actor.residence_country_name,
alias: actor.residence_country_alias,
};
}
return curatedActor;
}
function curateActors(releases) {
return Promise.all(releases.map(async release => curateActor(release)));
}
function curateActorEntry(actor, scraped, scrapeSuccess) {
const curatedActor = {
name: capitalize(actor.name),
slug: slugify(actor.name),
birthdate: actor.birthdate,
description: actor.description,
gender: actor.gender,
ethnicity: actor.ethnicity,
bust: actor.bust,
waist: actor.waist,
hip: actor.hip,
natural_boobs: actor.naturalBoobs,
height: actor.height,
weight: actor.weight,
hair: actor.hair,
eyes: actor.eyes,
has_tattoos: actor.hasTattoos,
has_piercings: actor.hasPiercings,
tattoos: actor.tattoos,
piercings: actor.piercings,
};
if (actor.id) {
curatedActor.id = actor.id;
}
if (actor.birthPlace) {
curatedActor.birth_city = actor.birthPlace.city;
curatedActor.birth_state = actor.birthPlace.state;
curatedActor.birth_country_alpha2 = actor.birthPlace.country;
}
if (actor.residencePlace) {
curatedActor.residence_city = actor.residencePlace.city;
curatedActor.residence_state = actor.residencePlace.state;
curatedActor.residence_country_alpha2 = actor.residencePlace.country;
}
if (scraped) {
curatedActor.scraped_at = new Date();
curatedActor.scrape_success = scrapeSuccess;
}
return curatedActor;
}
function curateSocialEntry(url, actorId) {
const platforms = [
// links supplied by PH often look like domain.com/domain.com/username
{
label: 'twitter',
pattern: 'http(s)\\://(*)twitter.com/:username(/)(?*)',
format: username => `https://www.twitter.com/${username}`,
},
{
label: 'youtube',
pattern: 'http(s)\\://(*)youtube.com/channel/:username(?*)',
format: username => `https://www.youtube.com/channel/${username}`,
},
{
label: 'instagram',
pattern: 'http(s)\\://(*)instagram.com/:username(/)(?*)',
format: username => `https://www.instagram.com/${username}`,
},
{
label: 'snapchat',
pattern: 'http(s)\\://(*)snapchat.com/add/:username(/)(?*)',
format: username => `https://www.snapchat.com/add/${username}`,
},
{
label: 'tumblr',
pattern: 'http(s)\\://:username.tumblr.com(*)',
format: username => `https://${username}.tumblr.com`,
},
{
label: 'onlyfans',
pattern: 'http(s)\\://(*)onlyfans.com/:username(/)(?*)',
format: username => `https://www.onlyfans.com/${username}`,
},
{
label: 'fancentro',
pattern: 'http(s)\\://(*)fancentro.com/:username(/)(?*)',
format: username => `https://www.fancentro.com/${username}`,
},
{
label: 'modelhub',
pattern: 'http(s)\\://(*)modelhub.com/:username(/)(?*)',
format: username => `https://www.modelhub.com/${username}`,
},
];
const match = platforms.reduce((acc, platform) => {
if (acc) return acc;
const patternMatch = new UrlPattern(platform.pattern).match(url);
if (patternMatch) {
return {
platform: platform.label,
original: url,
username: patternMatch.username,
url: platform.format ? platform.format(patternMatch.username) : url,
};
}
return null;
}, null) || { url };
return {
url: match.url,
platform: match.platform,
actor_id: actorId,
};
}
async function curateSocialEntries(urls, actorId) {
if (!urls) {
return [];
}
const existingSocialLinks = await knex('actors_social').where('actor_id', actorId);
return urls.reduce((acc, url) => {
const socialEntry = curateSocialEntry(url, actorId);
if (acc.some(entry => socialEntry.url.toLowerCase() === entry.url.toLowerCase()) || existingSocialLinks.some(entry => socialEntry.url.toLowerCase() === entry.url.toLowerCase())) {
// prevent duplicates
return acc;
}
return [...acc, socialEntry];
}, []);
}
async function fetchActors(queryObject, limit = 100) {
const releases = await knex('actors')
.select(
'actors.*',
'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name', 'birth_countries.alias as birth_country_alias',
'residence_countries.alpha2 as residence_country_alpha2', 'residence_countries.name as residence_country_name', 'residence_countries.alias as residence_country_alias',
)
.leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
.leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2')
.orderBy(['actors.name', 'actors.gender'])
.where(builder => whereOr(queryObject, 'actors', builder))
.limit(limit);
return curateActors(releases);
}
async function storeSocialLinks(urls, actorId) {
const curatedSocialEntries = await curateSocialEntries(urls, actorId);
await knex('actors_social').insert(curatedSocialEntries);
}
async function storeAvatars(avatars, actorId) {
if (!avatars || avatars.length === 0) {
return [];
}
const avatarsBySource = await storeMedia(avatars, 'actor', 'avatar');
await associateMedia({ [actorId]: avatars }, avatarsBySource, 'actor', 'photo', 'avatar');
return avatarsBySource;
}
async function storeActor(actor, scraped = false, scrapeSuccess = false) {
const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess);
const [actorEntry] = await knex('actors')
.insert(curatedActor)
.returning('*');
await storeSocialLinks(actor.social, actorEntry.id);
if (actor.avatars) {
await storeAvatars(actor.avatars, actorEntry.id);
}
logger.info(`Added new entry for actor '${actor.name}'`);
return actorEntry;
}
async function updateActor(actor, scraped = false, scrapeSuccess = false) {
const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess);
const [actorEntry] = await knex('actors')
.where({ id: actor.id })
.update(curatedActor)
.returning('*');
await storeSocialLinks(actor.social, actor.id);
logger.info(`Updated entry for actor '${actor.name}'`);
return actorEntry;
}
async function mergeProfiles(profiles, actor) {
if (profiles.filter(Boolean).length === 0) {
return null;
}
const mergedProfile = profiles.reduce((prevProfile, profile) => {
if (profile === null) {
return prevProfile;
}
const accProfile = {
id: actor ? actor.id : null,
name: actor ? actor.name : (prevProfile.name || profile.name),
description: prevProfile.description || profile.description,
gender: prevProfile.gender || profile.gender,
birthdate: !prevProfile.birthdate || Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate,
birthPlace: prevProfile.birthPlace || profile.birthPlace,
residencePlace: prevProfile.residencePlace || profile.residencePlace,
nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available
ethnicity: prevProfile.ethnicity || profile.ethnicity,
bust: prevProfile.bust || (/\d+\w+/.test(profile.bust) ? profile.bust : null),
waist: prevProfile.waist || profile.waist,
hip: prevProfile.hip || profile.hip,
naturalBoobs: prevProfile.naturalBoobs === undefined ? profile.naturalBoobs : prevProfile.naturalBoobs,
height: prevProfile.height || profile.height,
weight: prevProfile.weight || profile.weight,
hair: prevProfile.hair || profile.hair,
eyes: prevProfile.eyes || profile.eyes,
hasPiercings: prevProfile.hasPiercings === undefined ? profile.hasPiercings : prevProfile.hasPiercings,
hasTattoos: prevProfile.hasTattoos === undefined ? profile.hasTattoos : prevProfile.hasTattoos,
piercings: prevProfile.piercings || profile.piercings,
tattoos: prevProfile.tattoos || profile.tattoos,
social: prevProfile.social.concat(profile.social || []),
releases: prevProfile.releases.concat(profile.releases ? profile.releases : []), // don't flatten fallbacks
};
if (profile.avatar) {
const avatar = Array.isArray(profile.avatar)
? profile.avatar.map(avatarX => ({
src: avatarX.src || avatarX,
scraper: profile.scraper,
copyright: avatarX.copyright === undefined ? capitalize(profile.site?.name || profile.scraper) : profile.avatar.copyright,
}))
: {
src: profile.avatar.src || profile.avatar,
scraper: profile.scraper,
copyright: profile.avatar.copyright === undefined ? capitalize(profile.site?.name || profile.scraper) : profile.avatar.copyright,
};
accProfile.avatars = prevProfile.avatars.concat([avatar]); // don't flatten fallbacks
} else {
accProfile.avatars = prevProfile.avatars;
}
return accProfile;
}, {
social: [],
avatars: [],
releases: [],
});
const [birthPlace, residencePlace] = await Promise.all([
resolvePlace(mergedProfile.birthPlace),
resolvePlace(mergedProfile.residencePlace),
]);
mergedProfile.birthPlace = birthPlace;
mergedProfile.residencePlace = residencePlace;
if (!mergedProfile.birthPlace && mergedProfile.nationality) {
const country = await knex('countries')
.where('nationality', 'ilike', `%${mergedProfile.nationality}%`)
.orderBy('priority', 'desc')
.first();
mergedProfile.birthPlace = {
country: country.alpha2,
};
}
return mergedProfile;
}
async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) {
return Promise.map(sources, async (source) => {
// const [scraperSlug, scraper] = source;
const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] }));
try {
return await profileScrapers.reduce(async (outcome, { scraper, scraperSlug }) => outcome.catch(async () => {
if (!scraper) {
logger.warn(`No profile scraper available for ${scraperSlug}`);
throw Object.assign(new Error(`No profile scraper available for ${scraperSlug}`));
}
logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);
const site = sitesBySlug[scraperSlug] || null;
const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, include);
if (profile) {
logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);
return {
...profile,
name: actorName,
scraper: scraperSlug,
site,
releases: profile.releases?.map(release => (typeof release === 'string'
? { url: release, site }
: { ...release, site: release.site || site }
)),
};
}
logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}`);
throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false });
}), Promise.reject(new Error()));
} catch (error) {
if (error.warn !== false) {
logger.warn(`Error in scraper ${source}: ${error.message}`);
// logger.error(error.stack);
}
}
return null;
});
}
async function scrapeActors(actorNames) {
return Promise.map(actorNames || argv.actors, async (actorName) => {
try {
const actorSlug = slugify(actorName);
const actorEntry = await knex('actors').where({ slug: actorSlug }).first();
const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested
const [siteEntries, networkEntries] = await Promise.all([
knex('sites')
.leftJoin('networks', 'sites.network_id', 'networks.id')
.select(
'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
)
.whereIn('sites.slug', finalSources.flat()),
knex('networks').select('*').whereIn('slug', finalSources.flat()),
]);
const sites = await curateSites(siteEntries, true);
const networks = networkEntries.map(network => ({ ...network, isFallback: true }));
const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
const profiles = await scrapeProfiles(sources, actorName, actorEntry, sitesBySlug);
const profile = await mergeProfiles(profiles, actorEntry);
if (profile === null) {
logger.warn(`Could not find profile for actor '${actorName}'`);
if (argv.save && !actorEntry) {
await storeActor({ name: actorName }, false, false);
}
return null;
}
if (argv.inspect) {
console.log(profile);
logger.info(`Found ${profile.releases.length} releases for ${actorName}`);
}
if (argv.save) {
if (actorEntry && profile) {
await Promise.all([
updateActor(profile, true, true),
storeAvatars(profile.avatars, actorEntry.id),
]);
return profile;
}
await storeActor(profile, true, true);
}
return profile;
} catch (error) {
console.log(error);
logger.warn(`${actorName}: ${error}`);
return null;
}
}, {
concurrency: 3,
});
}
async function scrapeBasicActors() {
const basicActors = await knex('actors').where('scraped_at', null);
return scrapeActors(basicActors.map(actor => actor.name));
}
async function associateActors(mappedActors, releases) {
const [existingActorEntries, existingAssociationEntries] = await Promise.all([
knex('actors')
.whereIn('name', Object.values(mappedActors).map(actor => actor.name))
.orWhereIn('slug', Object.keys(mappedActors)),
knex('releases_actors').whereIn('release_id', releases.map(release => release.id)),
]);
const associations = await Promise.map(Object.entries(mappedActors), async ([actorSlug, actor]) => {
try {
const actorEntry = existingActorEntries.find(actorX => actorX.slug === actorSlug)
|| await storeActor(actor);
// if a scene
return Array.from(actor.releaseIds)
.map(releaseId => ({
release_id: releaseId,
actor_id: actorEntry.id,
}))
.filter(association => !existingAssociationEntries
// remove associations already in database
.some(associationEntry => associationEntry.actor_id === association.actor_id
&& associationEntry.release_id === association.release_id));
} catch (error) {
logger.error(actor.name, error);
return null;
}
});
await knex('releases_actors').insert(associations.filter(association => association).flat());
// basic actor scraping is failure prone, don't run together with actor association
// await scrapeBasicActors();
}
module.exports = {
associateActors,
fetchActors,
scrapeActors,
scrapeBasicActors,
};

View File

@@ -1,539 +1,26 @@
(src/actors.js: the 539 removed lines are the previous actor module, preserved verbatim above as src/actors-legacy.js; the rewritten file contains only the new storeReleaseActors function.)

'use strict';

const slugify = require('./utils/slugify');

async function storeReleaseActors(releases) {
  const releaseIdsByActor = releases.reduce(
    (acc, release) => release.actors.reduce((actorAcc, actor) => {
      const releaseActor = actor.name ? actor : { name: actor };
      const actorSlug = slugify(releaseActor.name);

      return {
        ...actorAcc,
        [actorSlug]: actorAcc[actorSlug]
          ? actorAcc[actorSlug].concat(release.id)
          : [release.id],
      };
    }, acc),
    {},
  );

  console.log(releaseIdsByActor);
}

module.exports = {
  storeReleaseActors,
};

View File

@@ -5,7 +5,10 @@ const argv = require('./argv');
 const initServer = require('./web/server');
 const knex = require('./knex');
-const fetchUpdates = require('./fetch-updates');
+const fetchUpdates = require('./updates');
+const fetchDeep = require('./deep');
+const { storeReleases } = require('./store-releases');
+// const { storeReleaseActors } = require('./actors');

 async function init() {
   if (argv.server) {
@@ -13,7 +16,15 @@ async function init() {
     return;
   }

-  await fetchUpdates();
+  const updateBaseReleases = (argv.scrape || argv.sites || argv.networks) && await fetchUpdates();
+  const updateDeepReleases = updateBaseReleases && await fetchDeep(updateBaseReleases);
+
+  const argvDeepReleases = argv.scenes && await fetchDeep(argv.scenes);
+
+  await storeReleases([...(updateDeepReleases || []), ...(argvDeepReleases || [])]);
+
+  // await storeReleaseActors(updateReleases);

   knex.destroy();
 }
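The rewired entry point reduces to a three-stage pipeline. A simplified sketch of the flow above, with the argv gating stripped out:

const baseReleases = await fetchUpdates(); // shallow per-site listings (src/updates.js)
const deepReleases = await fetchDeep(baseReleases); // follow each scene URL for full data (src/deep.js)
await storeReleases(deepReleases); // curate and insert the results (src/store-releases.js)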

src/deep.js Normal file (145 lines)
View File

@@ -0,0 +1,145 @@
'use strict';
const argv = require('./argv');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const scrapers = require('./scrapers/scrapers');
const { curateSites } = require('./sites');
const { curateNetworks } = require('./networks');
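// derive a site slug from a scene URL's hostname,
// e.g. 'https://www.julesjordan.com/trial/scenes/...' becomes 'julesjordan'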
function urlToSiteSlug(url) {
try {
const slug = new URL(url)
.hostname
.match(/([\w-]+)\.\w+$/)?.[1];
return slug;
} catch (error) {
logger.warn(`Failed to derive site slug from '${url}': ${error.message}`);
return null;
}
}
async function findSites(baseReleases) {
const baseReleasesWithoutSite = baseReleases.filter(release => release.url && !release.site);
const siteSlugs = Array.from(new Set(
baseReleasesWithoutSite
.map(baseRelease => urlToSiteSlug(baseRelease.url))
.filter(Boolean),
));
const siteEntries = await knex('sites').whereIn('slug', siteSlugs);
const networkEntries = await knex('networks').whereIn('slug', siteSlugs);
const sites = await curateSites(siteEntries, true, false);
const networks = await curateNetworks(networkEntries, true, false, false);
const markedNetworks = networks.map(network => ({ ...network, isFallback: true }));
const sitesBySlug = []
.concat(sites, markedNetworks)
.reduce((accSites, site) => ({ ...accSites, [site.slug]: site }), {});
return sitesBySlug;
}
function toBaseReleases(baseReleasesOrUrls) {
return baseReleasesOrUrls
.map((baseReleaseOrUrl) => {
if (baseReleaseOrUrl.url) {
// base release with URL
return {
...baseReleaseOrUrl,
deep: false,
};
}
if (/^http/.test(baseReleaseOrUrl)) {
// URL
return {
url: baseReleaseOrUrl,
deep: false,
};
}
if (typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) {
// base release without URL, prepare for passthrough
return {
...baseReleaseOrUrl,
deep: false,
};
}
logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`);
return null;
})
.filter(Boolean);
}
async function scrapeRelease(baseRelease, sites, type = 'scene') {
const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)];
if (!site) {
logger.warn(`No site available for ${baseRelease.url}`);
return baseRelease;
}
if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
return {
...baseRelease,
site,
};
}
const scraper = scrapers.releases[site.slug];
if (!scraper) {
logger.warn(`Could not find scraper for ${baseRelease.url}`);
return baseRelease;
}
if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
return baseRelease;
}
try {
const scrapedRelease = type === 'scene'
? await scraper.fetchScene(baseRelease.url, site, baseRelease)
: await scraper.fetchMovie(baseRelease.url, site, baseRelease);
const mergedRelease = {
...baseRelease,
...scrapedRelease,
deep: !!scrapedRelease,
site,
};
if (scrapedRelease && baseRelease?.tags) {
mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags);
}
console.log(mergedRelease);
return mergedRelease;
} catch (error) {
logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);
return baseRelease;
}
}
async function scrapeReleases(baseReleases, sites) {
return Promise.all(baseReleases.map(baseRelease => scrapeRelease(baseRelease, sites)));
}
async function fetchReleases(baseReleasesOrUrls) {
const baseReleases = toBaseReleases(baseReleasesOrUrls);
const sites = await findSites(baseReleases);
const deepReleases = await scrapeReleases(baseReleases, sites);
return deepReleases;
}
module.exports = fetchReleases;
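The exported fetchReleases (required as fetchDeep in the entry point above) accepts bare scene URLs, base release objects, or a mix; toBaseReleases normalizes both forms. A minimal usage sketch, with hypothetical URLs, inside an async context:

const fetchDeep = require('./deep');

const deepReleases = await fetchDeep([
  'https://www.example-site.com/scene/12345', // bare URL, normalized by toBaseReleases
  { url: 'https://www.example-site.com/scene/67890', title: 'Known Title' }, // base release object
]);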

View File

@@ -4,29 +4,33 @@ const knex = require('./knex');
 const whereOr = require('./utils/where-or');
 const { fetchSites } = require('./sites');

-async function curateNetwork(network, includeParameters = false) {
-  const [sites, studios] = await Promise.all([
-    fetchSites({ network_id: network.id }),
-    knex('studios')
-      .where({ network_id: network.id }),
-  ]);
-
-  return {
+async function curateNetwork(network, includeParameters = false, includeSites = true, includeStudios = false) {
+  const curatedNetwork = {
     id: network.id,
     name: network.name,
     url: network.url,
     description: network.description,
     slug: network.slug,
-    sites,
     parameters: includeParameters ? network.parameters : null,
-    studios: studios.map(studio => ({
-      id: studio.id,
-      name: studio.name,
-      url: studio.url,
-      description: studio.description,
-      slug: studio.slug,
-    })),
-  };
+  };
+
+  if (includeSites) {
+    curatedNetwork.sites = await fetchSites({ network_id: network.id });
+  }
+
+  if (includeStudios) {
+    const studios = await knex('studios').where({ network_id: network.id });
+
+    curatedNetwork.studios = studios.map(studio => ({
+      id: studio.id,
+      name: studio.name,
+      url: studio.url,
+      description: studio.description,
+      slug: studio.slug,
+    }));
+  }
+
+  return curatedNetwork;
 }

 function curateNetworks(releases) {
@@ -69,6 +73,8 @@ async function fetchNetworksFromReleases() {
 }

 module.exports = {
+  curateNetwork,
+  curateNetworks,
   fetchNetworks,
   fetchNetworksFromReleases,
   findNetworkByUrl,

View File

@@ -15,7 +15,7 @@ const {
   storeMedia,
   associateMedia,
 } = require('./media');
-const { fetchSites, findSiteByUrl } = require('./sites');
+const { fetchSites } = require('./sites');
 const slugify = require('./utils/slugify');
 const capitalize = require('./utils/capitalize');
@@ -174,16 +174,7 @@ async function attachChannelSite(release) {
     };
   }

-  try {
-    const urlSite = await findSiteByUrl(release.channel.url || release.channel);
-
-    return {
-      ...release,
-      site: urlSite,
-    };
-  } catch (error) {
-    throw new Error(`Unable to derive channel site from generic URL: ${release.url}`);
-  }
+  throw new Error(`Unable to match channel '${release.channel.slug || release.channel}' from generic URL: ${release.url}`);
 }

 async function attachStudio(release) {

View File

@@ -90,7 +90,7 @@ async function scrapeProfile({ qu }, site, withScenes) {
   const bio = qu.all('.stats li', true).reduce((acc, row) => {
     const [key, value] = row.split(':');
-    return { ...acc, [slugify(key, { delimiter: '_' })]: value.trim() };
+    return { ...acc, [slugify(key, '_')]: value.trim() };
   }, {});

   if (bio.height) profile.height = feetInchesToCm(bio.height);
@@ -133,7 +133,7 @@ async function fetchScene(url, site) {
 }

 async function fetchProfile(actorName, scraperSlug, site, include) {
-  const actorSlugA = slugify(actorName, { delimiter: '' });
+  const actorSlugA = slugify(actorName, '');
   const actorSlugB = slugify(actorName);

   const resA = await get(`${site.url}/models/${actorSlugA}.html`);

View File

@@ -43,7 +43,7 @@ function scrapeAll(html, site, upcoming) {
   const poster = `https:${$(element).find('.card-main-img').attr('data-src')}`;
   const photos = $(element).find('.card-overlay .image-under').map((photoIndex, photoElement) => `https:${$(photoElement).attr('data-src')}`).toArray();

-  const channel = slugify($(element).find('.collection').attr('title'), { delimiter: '' });
+  const channel = slugify($(element).find('.collection').attr('title'), '');

   return acc.concat({
     url,

View File

@@ -61,7 +61,7 @@ function scrapeProfile({ q, qa, qtx }) {
   const keys = qa('.model-descr_line:not(.model-descr_rait) p.text span', true);
   const values = qa('.model-descr_line:not(.model-descr_rait) p.text').map(el => qtx(el));
-  const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, { delimiter: '_' })]: values[index] }), {});
+  const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {});

   if (bio.height) profile.height = Number(bio.height.match(/\((\d+)cm\)/)[1]);
   if (bio.weight) profile.weight = Number(bio.weight.match(/\((\d+)kg\)/)[1]);
@@ -122,7 +122,7 @@ async function fetchScene(url, site, release) {
 async function fetchProfile(actorName, scraperSlug) {
   const actorSlug = slugify(actorName);
-  const actorSlug2 = slugify(actorName, { delimiter: '' });
+  const actorSlug2 = slugify(actorName, '');

   const [url, url2] = ['cherrypimps', 'wildoncam'].includes(scraperSlug)
     ? [`https://${scraperSlug}.com/models/${actorSlug}.html`, `https://${scraperSlug}.com/models/${actorSlug2}.html`]

View File

@@ -74,7 +74,7 @@ async function fetchActorReleases(urls) {
 async function scrapeProfile(html, _url, actorName) {
   const { qu } = ex(html);

-  const keys = qu.all('.about-title', true).map(key => slugify(key, { delimiter: '_' }));
+  const keys = qu.all('.about-title', true).map(key => slugify(key, '_'));
   const values = qu.all('.about-info').map((el) => {
     if (el.children.length > 0) {
       return Array.from(el.children, child => child.textContent.trim()).join(', ');

View File

@@ -79,7 +79,7 @@ async function fetchScene(url, site) {
 }

 async function fetchProfile(actorName, scraperSlug) {
-  const actorSlug = slugify(actorName, { delimiter: '' });
+  const actorSlug = slugify(actorName, '');
   const url = scraperSlug === 'povperverts'
     ? `https://povperverts.net/models/${actorSlug}.html`
     : `https://${scraperSlug}.com/models/${actorSlug}.html`;

View File

@@ -233,7 +233,7 @@ async function scrapeScene(html, url, site, baseRelease, mobileHtml) {
   release.tags = hasTrans ? [...rawTags, 'transsexual'] : rawTags;

   const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim() || $('.siteNameSpan').text()?.trim().toLowerCase().replace('.com', '');
-  if (channel) release.channel = slugify(channel, { delimiter: '' });
+  if (channel) release.channel = slugify(channel, '');

   if (videoData.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/

View File

@@ -193,7 +193,7 @@ function scrapeSceneT1({ html, qu }, site, url, baseRelease, channelRegExp) {
     if (channel) {
       release.channel = {
         force: true,
-        slug: slugify(channel, { delimiter: '' }),
+        slug: slugify(channel, ''),
       };
     }
   }
@@ -239,7 +239,7 @@ function scrapeProfile({ el, qu }, site) {
     return {
       ...acc,
-      [slugify(key, { delimiter: '_' })]: value.trim(),
+      [slugify(key, '_')]: value.trim(),
     };
   }, {});
@@ -272,7 +272,7 @@ function scrapeProfileT1({ el, qu }, site) {
     return {
       ...acc,
-      [slugify(key, { delimiter: '_' })]: value.trim(),
+      [slugify(key, '_')]: value.trim(),
     };
   }, {});
@@ -308,7 +308,7 @@ function scrapeProfileTour({ el, qu }, site) {
     return {
       ...acc,
-      [slugify(key, { delimiter: '_' })]: value.trim(),
+      [slugify(key, '_')]: value.trim(),
     };
   }, {});
@@ -382,7 +382,7 @@ async function fetchScene(url, site, baseRelease, beforeFetchLatest) {
 }

 async function fetchProfile(actorName, scraperSlug, site) {
-  const actorSlugA = slugify(actorName, { delimiter: '' });
+  const actorSlugA = slugify(actorName, '');
   const actorSlugB = slugify(actorName);

   const t1 = site.parameters?.t1 ? 't1/' : '';

View File

@@ -384,8 +384,8 @@ async function fetchMovie(url, site) {
 }

 async function fetchProfile(actorName) {
-  const actorSlugA = slugify(actorName, { delimiter: '-' });
-  const actorSlugB = slugify(actorName, { delimiter: '' });
+  const actorSlugA = slugify(actorName, '-');
+  const actorSlugB = slugify(actorName, '');

   const urlA = `https://julesjordan.com/trial/models/${actorSlugA}.html`;
   const urlB = `https://julesjordan.com/trial/models/${actorSlugB}.html`;

View File

@@ -98,7 +98,7 @@ function scrapeScene(data, url, _site, networkName) {
   }

   const siteName = data.collections[0]?.name || data.brand;
-  release.channel = slugify(siteName, { delimiter: '' });
+  release.channel = slugify(siteName, '');

   release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`;

View File

@@ -94,7 +94,7 @@ function scrapeProfile({ qu }, _actorName, origin) {
   const keys = qu.all('.model-profile h5', true);
   const values = qu.all('.model-profile h5 + p', true);
-  const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, { delimiter: '_' })]: values[index] }), {});
+  const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {});

   profile.age = Number(bio.age);
   profile.description = qu.q('.model-bio', true);

View File

@@ -95,7 +95,7 @@ async function scrapeScene(html, url, site) {
   release.movie = $('a[data-track="FULL MOVIE"]').attr('href');

   const siteElement = $('.content-wrapper .logos-sites a');
-  if (siteElement) release.channel = slugify(siteElement.text(), { delimiter: '' });
+  if (siteElement) release.channel = slugify(siteElement.text(), '');

   return release;
 }
@@ -108,7 +108,7 @@ function scrapeProfile({ html, q, qa, qtx }) {
     const trimmedValue = value.trim();
     if (trimmedValue.length === 0 || trimmedValue === '-') return acc;
-    return { ...acc, [slugify(key, { delimiter: '_' })]: trimmedValue };
+    return { ...acc, [slugify(key, '_')]: trimmedValue };
   }, {});

   const description = q('.model-facts-long', true);
@@ -176,7 +176,7 @@ async function fetchScene(url, site) {
 }

 async function fetchProfile(actorName) {
-  const actorSearchSlug = slugify(actorName, { delimiter: '+' });
+  const actorSearchSlug = slugify(actorName, '+');
   const url = `https://www.private.com/search.php?query=${actorSearchSlug}`;

   const modelRes = await geta(url, '.model h3 a');

View File

@@ -155,7 +155,7 @@ async function scrapeProfile(html, actorUrl, withReleases) {
   const bio = qa('.stat').reduce((acc, el) => {
     const prop = q(el, '.label', true).slice(0, -1);
-    const key = slugify(prop, { delimiter: '_' });
+    const key = slugify(prop, '_');
     const value = q(el, '.value', true);

     return {

View File

@@ -7,19 +7,13 @@ const argv = require('./argv');
 const knex = require('./knex');
 const whereOr = require('./utils/where-or');

-async function curateSite(site, includeParameters = false) {
-  const tags = await knex('sites_tags')
-    .select('tags.*', 'sites_tags.inherit')
-    .where('site_id', site.id)
-    .join('tags', 'tags.id', 'sites_tags.tag_id');
-
-  return {
+async function curateSite(site, includeParameters = false, includeTags = true) {
+  const curatedSite = {
     id: site.id,
     name: site.name,
     url: site.url,
     description: site.description,
     slug: site.slug,
-    tags,
     independent: !!site.parameters && site.parameters.independent,
     parameters: includeParameters ? site.parameters : null,
     network: {
@@ -31,6 +25,15 @@ async function curateSite(site, includeParameters = false) {
       parameters: includeParameters ? site.network_parameters : null,
     },
   };
+
+  if (includeTags) {
+    curatedSite.tags = await knex('sites_tags')
+      .select('tags.*', 'sites_tags.inherit')
+      .where('site_id', site.id)
+      .join('tags', 'tags.id', 'sites_tags.tag_id');
+  }
+
+  return curatedSite;
 }

 function curateSites(sites, includeParameters) {
@@ -78,7 +81,7 @@ async function findSiteByUrl(url) {
     .first();

   if (site) {
-    const curatedSite = curateSite(site, true);
+    const curatedSite = curateSite(site, true, false);

     return curatedSite;
   }
@@ -182,6 +185,7 @@ async function fetchSitesFromReleases() {
 }

 module.exports = {
+  curateSite,
   curateSites,
   fetchIncludedSites,
   fetchSites,

src/store-releases.js Normal file (71 lines)
View File

@@ -0,0 +1,71 @@
'use strict';
const config = require('config');
const knex = require('./knex');
const slugify = require('./utils/slugify');
function curateReleaseEntry(release, batchId, existingRelease) {
const slug = slugify(release.title, '-', {
encode: true,
limit: config.titleSlugLength,
});
const curatedRelease = {
title: release.title,
entry_id: release.entryId || null,
site_id: release.site.id,
shoot_id: release.shootId || null,
studio_id: release.studio?.id || null,
url: release.url,
date: release.date,
slug,
description: release.description,
duration: release.duration,
type: release.type,
// director: release.director,
// likes: release.rating && release.rating.likes,
// dislikes: release.rating && release.rating.dislikes,
// rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
deep: typeof release.deep === 'boolean' ? release.deep : false,
deep_url: release.deepUrl,
updated_batch_id: batchId,
};
if (!existingRelease) {
curatedRelease.created_batch_id = batchId;
}
return curatedRelease;
}
async function attachSite(releases) {
const releasesWithoutSite = releases.filter(release => !release.site || release.site.isFallback);
// console.log(releases, releasesWithoutSite);
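// note: site matching is not implemented yet; this stub returns undefined,
// and storeReleases below ignores its result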
}
async function extractUniqueReleases(releases) {
const duplicateReleaseEntries = await knex('releases')
.whereIn(['entry_id', 'site_id'], releases.map(release => [release.entryId, release.site.id]));
const duplicateReleaseEntryKeys = new Set(duplicateReleaseEntries.map(releaseEntry => `${releaseEntry.site_id}_${releaseEntry.entry_id}`));
const uniqueReleases = releases.filter(release => !duplicateReleaseEntryKeys.has(`${release.site.id}_${release.entryId}`));
return uniqueReleases;
}
async function storeReleases(releases) {
const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
const uniqueReleases = await extractUniqueReleases(releases);
const releasesWithSites = await attachSite(releases);
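// note: the slice(0, 2) below caps the insert at two releases, presumably a leftover testing limit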
const curatedReleaseEntries = uniqueReleases.slice(0, 2).map(release => curateReleaseEntry(release, batchId));
await knex('releases').insert(curatedReleaseEntries);
}
module.exports = {
storeReleases,
};

View File

@@ -83,7 +83,10 @@ async function scrapeLatestReleases(scraper, site, preData) {
   const latestReleasesWithSite = latestReleases.map(release => ({ ...release, site: release.site || site })); // attach the site each release is assigned to when stored
   const oldestReleaseOnPage = latestReleases.sort((releaseA, releaseB) => releaseB.date - releaseA.date).slice(-1)[0];

-  const uniqueReleases = await extractUniqueReleases(latestReleasesWithSite, accReleases);
+  const uniqueReleases = argv.redownload
+    ? latestReleasesWithSite
+    : await extractUniqueReleases(latestReleasesWithSite, accReleases);

   const pageAccReleases = accReleases.concat(uniqueReleases);

   logger.verbose(`Scraped '${site.name}' (${site.network.name}) page ${page}, found ${uniqueReleases.length} unique releases`);
@@ -204,7 +207,9 @@ async function fetchUpdates() {
     { concurrency: 5 },
   );

-  return scrapedNetworks;
+  const releases = scrapedNetworks.flat(2);
+
+  return releases;
 }

 module.exports = fetchUpdates;

View File

@@ -1,13 +1,14 @@
 'use strict';

-function slugify(string, {
+function slugify(string, delimiter = '-', {
   encode = false,
-  delimiter = '-',
   limit = 1000,
 } = {}) {
   const slugComponents = string.trim().toLowerCase().match(/\w+/g);

-  if (!slugComponents) return '';
+  if (!slugComponents) {
+    return '';
+  }

   const slug = slugComponents.reduce((acc, component, index) => {
     const accSlug = `${acc}${index > 0 ? delimiter : ''}${component}`;
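The delimiter is now slugify's second positional argument rather than an options key, which is what every call-site change above reflects. Equivalent calls under the new signature (illustrative values):

slugify('Jules Jordan', ''); // 'julesjordan', previously slugify('Jules Jordan', { delimiter: '' })
slugify('Oral Creampie', '_'); // 'oral_creampie'
slugify('Some Scene Title'); // 'some-scene-title', '-' remains the default
slugify(release.title, '-', { encode: true, limit: config.titleSlugLength }); // options object still carries encode and limit, as in store-releases.js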