Refactoring deep scrape. Added tag posters.
After Width: | Height: | Size: 5.0 MiB |
After Width: | Height: | Size: 96 KiB |
After Width: | Height: | Size: 1.0 MiB |
After Width: | Height: | Size: 102 KiB |
After Width: | Height: | Size: 1.3 MiB |
After Width: | Height: | Size: 94 KiB |
Before Width: | Height: | Size: 101 KiB |
Before Width: | Height: | Size: 17 KiB |
|
@ -6,6 +6,7 @@ const tagPosters = [
|
|||
['bdsm', 0, 'Dani Daniels in "The Traning of Dani Daniels, Day 2" for The Training of O at Kink'],
|
||||
['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'],
|
||||
['da-tp', 0, 'Natasha Teen in LegalPorno SZ2164'],
|
||||
['deepthroat', 0, 'Chanel Grey in "Deepthroating Is Fun" for Throated'],
|
||||
['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'],
|
||||
['double-penetration', 'poster', 'Mia Malkova in "DP!" for HardX'],
|
||||
['double-vaginal', 'poster', 'Riley Reid in "Pizza That Ass" for Reid My Lips'],
|
||||
|
@ -13,7 +14,7 @@ const tagPosters = [
|
|||
['oral-creampie', 1, 'Keisha Grey in Brazzers House'],
|
||||
['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'],
|
||||
['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'],
|
||||
['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'],
|
||||
['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
|
||||
['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'],
|
||||
['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
|
||||
['blowbang', 'poster'],
|
||||
|
@ -27,7 +28,7 @@ const tagPosters = [
|
|||
['gaping', 1, 'Vina Sky in "Vina Sky Does Anal" for HardX'],
|
||||
['interracial', 'poster'],
|
||||
['latina', 'poster'],
|
||||
['mff', 'poster'],
|
||||
['mff', 0, 'Madison Ivy and Adriana Chechik in "Day With A Pornstar" for Brazzers'],
|
||||
['mfm', 'poster'],
|
||||
['orgy', 'poster'],
|
||||
['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
|
||||
|
@ -47,6 +48,7 @@ const tagPosters = [
|
|||
const tagPhotos = [
|
||||
['airtight', 2, 'Dakota Skye in "Dakota Goes Nuts" for ArchAngel'],
|
||||
['airtight', 3, 'Anita Bellini in "Triple Dick Gangbang" for Hands On Hardcore (DDF Network)'],
|
||||
['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'],
|
||||
['anal', 0],
|
||||
['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'],
|
||||
['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'],
|
||||
|
|
|
@ -0,0 +1,539 @@
|
|||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
const UrlPattern = require('url-pattern');
|
||||
const moment = require('moment');
|
||||
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const argv = require('./argv');
|
||||
const include = require('./utils/argv-include')(argv);
|
||||
const scrapers = require('./scrapers/scrapers');
|
||||
const whereOr = require('./utils/where-or');
|
||||
const resolvePlace = require('./utils/resolve-place');
|
||||
const slugify = require('./utils/slugify');
|
||||
const capitalize = require('./utils/capitalize');
|
||||
const { curateSites } = require('./sites');
|
||||
const { storeMedia, associateMedia } = require('./media');
|
||||
|
||||
/**
 * Turn a raw `actors` row into the camelCased actor object handed to callers.
 *
 * Aliases, avatar, photos and social links are loaded with four parallel
 * queries keyed on `actor.id`. `origin` and `residence` are `null` unless at
 * least one of their city/state/country columns is set.
 *
 * @param {Object} actor - row from `actors`, optionally carrying the joined
 *   `birth_country_*` / `residence_country_*` columns selected by fetchActors
 * @returns {Promise<Object>} curated actor
 */
async function curateActor(actor) {
  const [aliases, avatar, photos, social] = await Promise.all([
    knex('actors').where({ alias_for: actor.id }),
    knex('actors_avatars')
      .where('actor_id', actor.id)
      .join('media', 'media.id', 'actors_avatars.media_id')
      .first(),
    knex('actors_photos')
      .where('actor_id', actor.id)
      .join('media', 'media.id', 'actors_photos.media_id')
      .orderBy('index'),
    knex('actors_social')
      .where('actor_id', actor.id)
      .orderBy('platform', 'desc'),
  ]);

  const hasOrigin = actor.birth_city || actor.birth_state || actor.birth_country_alpha2;
  const hasResidence = actor.residence_city || actor.residence_state || actor.residence_country_alpha2;

  const curatedActor = {
    id: actor.id,
    gender: actor.gender,
    name: actor.name,
    description: actor.description,
    birthdate: actor.birthdate && new Date(actor.birthdate),
    country: actor.country_alpha2,
    origin: hasOrigin ? {} : null,
    residence: hasResidence ? {} : null,
    ethnicity: actor.ethnicity,
    height: actor.height,
    weight: actor.weight,
    bust: actor.bust,
    waist: actor.waist,
    hip: actor.hip,
    naturalBoobs: actor.natural_boobs,
    aliases: aliases.map(({ name }) => name),
    slug: actor.slug,
    avatar,
    photos,
    hasTattoos: actor.has_tattoos,
    hasPiercings: actor.has_piercings,
    tattoos: actor.tattoos,
    piercings: actor.piercings,
    social,
    scrapedAt: actor.scraped_at,
  };

  if (curatedActor.birthdate) {
    // age in whole years as of now
    curatedActor.age = moment().diff(curatedActor.birthdate, 'years');
  }

  // origin/residence are only dereferenced for fields that made them non-null above
  if (actor.birth_city) curatedActor.origin.city = actor.birth_city;
  if (actor.birth_state) curatedActor.origin.state = actor.birth_state;

  if (actor.birth_country_alpha2) {
    curatedActor.origin.country = {
      alpha2: actor.birth_country_alpha2,
      name: actor.birth_country_name,
      alias: actor.birth_country_alias,
    };
  }

  if (actor.residence_city) curatedActor.residence.city = actor.residence_city;
  if (actor.residence_state) curatedActor.residence.state = actor.residence_state;

  if (actor.residence_country_alpha2) {
    curatedActor.residence.country = {
      alpha2: actor.residence_country_alpha2,
      name: actor.residence_country_name,
      alias: actor.residence_country_alias,
    };
  }

  return curatedActor;
}
|
||||
|
||||
/**
 * Curate a list of actor rows in parallel.
 *
 * @param {Object[]} releases - actor rows (the parameter name is historical)
 * @returns {Promise<Object[]>} curated actors, in input order
 */
function curateActors(releases) {
  return Promise.all(releases.map(actor => curateActor(actor)));
}
|
||||
|
||||
/**
 * Map a scraped/merged actor profile onto the snake_cased columns of `actors`.
 *
 * `id`, the birth/residence columns and the scrape bookkeeping columns are
 * only set when the corresponding input is present, so they are left out of
 * the insert/update otherwise.
 *
 * @param {Object} actor - profile with camelCased fields
 * @param {boolean} scraped - whether a scrape was attempted; stamps `scraped_at`
 * @param {boolean} scrapeSuccess - stored as `scrape_success` when `scraped`
 * @returns {Object} row for the `actors` table
 */
function curateActorEntry(actor, scraped, scrapeSuccess) {
  const curatedActor = {
    name: capitalize(actor.name),
    slug: slugify(actor.name),
    birthdate: actor.birthdate,
    description: actor.description,
    gender: actor.gender,
    ethnicity: actor.ethnicity,
    bust: actor.bust,
    waist: actor.waist,
    hip: actor.hip,
    natural_boobs: actor.naturalBoobs,
    height: actor.height,
    weight: actor.weight,
    hair: actor.hair,
    eyes: actor.eyes,
    has_tattoos: actor.hasTattoos,
    has_piercings: actor.hasPiercings,
    tattoos: actor.tattoos,
    piercings: actor.piercings,
  };

  if (actor.id) {
    curatedActor.id = actor.id;
  }

  if (actor.birthPlace) {
    curatedActor.birth_city = actor.birthPlace.city;
    curatedActor.birth_state = actor.birthPlace.state;
    curatedActor.birth_country_alpha2 = actor.birthPlace.country;
  }

  if (actor.residencePlace) {
    curatedActor.residence_city = actor.residencePlace.city;
    curatedActor.residence_state = actor.residencePlace.state;
    curatedActor.residence_country_alpha2 = actor.residencePlace.country;
  }

  if (scraped) {
    curatedActor.scraped_at = new Date();
    curatedActor.scrape_success = scrapeSuccess;
  }

  return curatedActor;
}
|
||||
|
||||
/**
 * Normalize a social media URL into an `actors_social` row.
 *
 * The URL is tried against the known platform patterns in order; on the first
 * match the URL is rewritten into that platform's canonical form. URLs that
 * match no platform are stored as-is with an undefined platform.
 *
 * @param {string} url - social link as scraped
 * @param {number} actorId - actor the link belongs to
 * @returns {{ url: string, platform: (string|undefined), actor_id: number }}
 */
function curateSocialEntry(url, actorId) {
  const platforms = [
    // links supplied by PH often look like domain.com/domain.com/username
    {
      label: 'twitter',
      pattern: 'http(s)\\://(*)twitter.com/:username(/)(?*)',
      format: username => `https://www.twitter.com/${username}`,
    },
    {
      label: 'youtube',
      pattern: 'http(s)\\://(*)youtube.com/channel/:username(?*)',
      format: username => `https://www.youtube.com/channel/${username}`,
    },
    {
      label: 'instagram',
      pattern: 'http(s)\\://(*)instagram.com/:username(/)(?*)',
      format: username => `https://www.instagram.com/${username}`,
    },
    {
      label: 'snapchat',
      pattern: 'http(s)\\://(*)snapchat.com/add/:username(/)(?*)',
      format: username => `https://www.snapchat.com/add/${username}`,
    },
    {
      label: 'tumblr',
      pattern: 'http(s)\\://:username.tumblr.com(*)',
      format: username => `https://${username}.tumblr.com`,
    },
    {
      label: 'onlyfans',
      pattern: 'http(s)\\://(*)onlyfans.com/:username(/)(?*)',
      format: username => `https://www.onlyfans.com/${username}`,
    },
    {
      label: 'fancentro',
      pattern: 'http(s)\\://(*)fancentro.com/:username(/)(?*)',
      format: username => `https://www.fancentro.com/${username}`,
    },
    {
      label: 'modelhub',
      pattern: 'http(s)\\://(*)modelhub.com/:username(/)(?*)',
      format: username => `https://www.modelhub.com/${username}`,
    },
  ];

  // stop at the first platform that matches instead of compiling every pattern
  for (const platform of platforms) {
    const patternMatch = new UrlPattern(platform.pattern).match(url);

    if (patternMatch) {
      return {
        url: platform.format ? platform.format(patternMatch.username) : url,
        platform: platform.label,
        actor_id: actorId,
      };
    }
  }

  // unknown platform: keep the original URL
  return {
    url,
    platform: undefined,
    actor_id: actorId,
  };
}
|
||||
|
||||
/**
 * Curate a list of social URLs for an actor, dropping duplicates.
 *
 * A URL is a duplicate when its curated form equals, case-insensitively, a
 * link already stored for the actor or one seen earlier in `urls`.
 *
 * @param {string[]} [urls] - scraped social links
 * @param {number} actorId - actor the links belong to
 * @returns {Promise<Object[]>} new `actors_social` rows; empty when `urls` is falsy
 */
async function curateSocialEntries(urls, actorId) {
  if (!urls) {
    return [];
  }

  const existingSocialLinks = await knex('actors_social').where('actor_id', actorId);

  // one lookup set instead of rescanning both lists for every URL
  const seenUrls = new Set(existingSocialLinks.map(entry => entry.url.toLowerCase()));
  const socialEntries = [];

  for (const url of urls) {
    const socialEntry = curateSocialEntry(url, actorId);
    const urlKey = socialEntry.url.toLowerCase();

    if (!seenUrls.has(urlKey)) {
      seenUrls.add(urlKey);
      socialEntries.push(socialEntry);
    }
  }

  return socialEntries;
}
|
||||
|
||||
/**
 * Fetch and curate actors matching a query object.
 *
 * Rows are joined with `countries` twice (birth and residence) so the curated
 * actor can carry country names and aliases, ordered by name then gender.
 *
 * @param {Object} queryObject - filter passed to the `whereOr` helper
 * @param {number} [limit=100] - maximum number of actors returned
 * @returns {Promise<Object[]>} curated actors
 */
async function fetchActors(queryObject, limit = 100) {
  const actorEntries = await knex('actors')
    .select(
      'actors.*',
      'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name', 'birth_countries.alias as birth_country_alias',
      'residence_countries.alpha2 as residence_country_alpha2', 'residence_countries.name as residence_country_name', 'residence_countries.alias as residence_country_alias',
    )
    .leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
    .leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2')
    .orderBy(['actors.name', 'actors.gender'])
    .where(builder => whereOr(queryObject, 'actors', builder))
    .limit(limit);

  return curateActors(actorEntries);
}
|
||||
|
||||
/**
 * Store the social links of an actor that are not in the database yet.
 *
 * @param {string[]} [urls] - scraped social links
 * @param {number} actorId - actor the links belong to
 * @returns {Promise<void>}
 */
async function storeSocialLinks(urls, actorId) {
  const curatedSocialEntries = await curateSocialEntries(urls, actorId);

  if (curatedSocialEntries.length === 0) {
    // nothing new to store, skip the empty insert
    return;
  }

  await knex('actors_social').insert(curatedSocialEntries);
}
|
||||
|
||||
/**
 * Store an actor's avatars as media and associate them with the actor.
 *
 * @param {Array} [avatars] - avatar sources, passed through to `storeMedia`
 * @param {number} actorId - actor the avatars belong to
 * @returns {Promise<*>} the value returned by `storeMedia`, or an empty array
 *   when there are no avatars
 */
async function storeAvatars(avatars, actorId) {
  if (!avatars || avatars.length === 0) {
    return [];
  }

  const avatarsBySource = await storeMedia(avatars, 'actor', 'avatar');

  await associateMedia({ [actorId]: avatars }, avatarsBySource, 'actor', 'photo', 'avatar');

  return avatarsBySource;
}
|
||||
|
||||
/**
 * Insert a new actor, then store their social links and avatars.
 *
 * @param {Object} actor - actor profile (at least `name`)
 * @param {boolean} [scraped=false] - whether a profile scrape was attempted
 * @param {boolean} [scrapeSuccess=false] - whether that scrape found a profile
 * @returns {Promise<Object>} the inserted `actors` row
 */
async function storeActor(actor, scraped = false, scrapeSuccess = false) {
  const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess);

  const [actorEntry] = await knex('actors')
    .insert(curatedActor)
    .returning('*');

  // links and avatars need the generated actor ID
  await storeSocialLinks(actor.social, actorEntry.id);

  if (actor.avatars) {
    await storeAvatars(actor.avatars, actorEntry.id);
  }

  logger.info(`Added new entry for actor '${actor.name}'`);

  return actorEntry;
}
|
||||
|
||||
/**
 * Update an existing actor row and store any new social links.
 *
 * @param {Object} actor - actor profile including the `id` of the row to update
 * @param {boolean} [scraped=false] - whether a profile scrape was attempted
 * @param {boolean} [scrapeSuccess=false] - whether that scrape found a profile
 * @returns {Promise<Object>} the updated `actors` row
 */
async function updateActor(actor, scraped = false, scrapeSuccess = false) {
  const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess);

  const [actorEntry] = await knex('actors')
    .where({ id: actor.id })
    .update(curatedActor)
    .returning('*');

  await storeSocialLinks(actor.social, actor.id);

  logger.info(`Updated entry for actor '${actor.name}'`);

  return actorEntry;
}
|
||||
|
||||
/**
 * Merge the profiles scraped from several sources into one profile.
 *
 * Profiles are folded in order and, for most fields, the first truthy value
 * wins. `social`, `releases` and `avatars` are accumulated across profiles;
 * an array of avatars from one profile is kept as a nested array so it is
 * not flattened into the other sources' avatars. Birth and residence places
 * are passed through `resolvePlace`; when no birth place is available, the
 * birth country is derived from the nationality.
 *
 * @param {Array<Object|null>} profiles - scraped profiles, `null` where a source failed
 * @param {Object} [actor] - existing actor row; its `id` and `name` take precedence
 * @returns {Promise<Object|null>} merged profile, or `null` when no profile is available
 */
async function mergeProfiles(profiles, actor) {
  const availableProfiles = profiles.filter(Boolean);

  if (availableProfiles.length === 0) {
    return null;
  }

  // an avatar is either a bare source or an object with `src` and optional `copyright`
  const curateAvatar = (avatar, profile) => ({
    src: avatar.src || avatar,
    scraper: profile.scraper,
    copyright: avatar.copyright === undefined
      ? capitalize(profile.site?.name || profile.scraper)
      : avatar.copyright,
  });

  const mergedProfile = availableProfiles.reduce((prevProfile, profile) => {
    const accProfile = {
      id: actor ? actor.id : null,
      name: actor ? actor.name : (prevProfile.name || profile.name),
      description: prevProfile.description || profile.description,
      gender: prevProfile.gender || profile.gender,
      // an invalid Date from an earlier source must not block a valid one
      birthdate: !prevProfile.birthdate || Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate,
      birthPlace: prevProfile.birthPlace || profile.birthPlace,
      residencePlace: prevProfile.residencePlace || profile.residencePlace,
      nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available
      ethnicity: prevProfile.ethnicity || profile.ethnicity,
      bust: prevProfile.bust || (/\d+\w+/.test(profile.bust) ? profile.bust : null),
      waist: prevProfile.waist || profile.waist,
      hip: prevProfile.hip || profile.hip,
      // booleans: only fall through when the earlier source had no opinion
      naturalBoobs: prevProfile.naturalBoobs === undefined ? profile.naturalBoobs : prevProfile.naturalBoobs,
      height: prevProfile.height || profile.height,
      weight: prevProfile.weight || profile.weight,
      hair: prevProfile.hair || profile.hair,
      eyes: prevProfile.eyes || profile.eyes,
      hasPiercings: prevProfile.hasPiercings === undefined ? profile.hasPiercings : prevProfile.hasPiercings,
      hasTattoos: prevProfile.hasTattoos === undefined ? profile.hasTattoos : prevProfile.hasTattoos,
      piercings: prevProfile.piercings || profile.piercings,
      tattoos: prevProfile.tattoos || profile.tattoos,
      social: prevProfile.social.concat(profile.social || []),
      releases: prevProfile.releases.concat(profile.releases ? profile.releases : []), // don't flatten fallbacks
    };

    if (profile.avatar) {
      const avatar = Array.isArray(profile.avatar)
        ? profile.avatar.map(avatarX => curateAvatar(avatarX, profile))
        : curateAvatar(profile.avatar, profile);

      accProfile.avatars = prevProfile.avatars.concat([avatar]); // don't flatten fallbacks
    } else {
      accProfile.avatars = prevProfile.avatars;
    }

    return accProfile;
  }, {
    social: [],
    avatars: [],
    releases: [],
  });

  const [birthPlace, residencePlace] = await Promise.all([
    resolvePlace(mergedProfile.birthPlace),
    resolvePlace(mergedProfile.residencePlace),
  ]);

  mergedProfile.birthPlace = birthPlace;
  mergedProfile.residencePlace = residencePlace;

  if (!mergedProfile.birthPlace && mergedProfile.nationality) {
    const country = await knex('countries')
      .where('nationality', 'ilike', `%${mergedProfile.nationality}%`)
      .orderBy('priority', 'desc')
      .first();

    // the nationality may not match any known country
    if (country) {
      mergedProfile.birthPlace = {
        country: country.alpha2,
      };
    }
  }

  return mergedProfile;
}
|
||||
|
||||
/**
 * Scrape an actor's profile from every source.
 *
 * A source is either a single scraper slug or an array of slugs. The slugs in
 * an array are tried in order and the first one to return a profile wins; the
 * remaining slugs are skipped.
 *
 * @param {Array<string|string[]>} sources - scraper slugs or fallback groups
 * @param {string} actorName - name the actor was requested by
 * @param {Object} [actorEntry] - existing actor row; its name is used for the lookup
 * @param {Object} sitesBySlug - site/network entries keyed by slug
 * @returns {Promise<Array<Object|null>>} one profile per source, `null` where none was found
 */
async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) {
  // resolves to the profile from one scraper, rejects when it has none
  const scrapeProfile = async (scraperSlug) => {
    const scraper = scrapers.actors[scraperSlug];

    if (!scraper) {
      logger.warn(`No profile scraper available for ${scraperSlug}`);
      // already warned about above, don't warn again when the source fails
      throw Object.assign(new Error(`No profile scraper available for ${scraperSlug}`), { warn: false });
    }

    logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);

    const site = sitesBySlug[scraperSlug] || null;
    const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, include);

    if (profile) {
      logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);

      return {
        ...profile,
        name: actorName,
        scraper: scraperSlug,
        site,
        releases: profile.releases?.map(release => (typeof release === 'string'
          ? { url: release, site }
          : { ...release, site: release.site || site }
        )),
      };
    }

    logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}`);
    throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false });
  };

  return Promise.map(sources, async (source) => {
    const scraperSlugs = [].concat(source);

    try {
      // each slug is only tried when all previous ones in the group failed
      return await scraperSlugs.reduce(
        (outcome, scraperSlug) => outcome.catch(() => scrapeProfile(scraperSlug)),
        Promise.reject(new Error()),
      );
    } catch (error) {
      if (error.warn !== false) {
        logger.warn(`Error in scraper ${source}: ${error.message}`);
      }
    }

    return null;
  });
}
|
||||
|
||||
/**
 * Scrape, merge and optionally store the profiles of the given actors.
 *
 * Sources come from `argv.sources`, `config.profiles` or, failing both, every
 * available actor scraper. With `argv.save`, an existing actor is updated and
 * a new one is inserted; an actor without any profile is stored by name only.
 * At most three actors are scraped at a time.
 *
 * @param {string[]} [actorNames] - names to scrape, defaults to `argv.actors`
 * @returns {Promise<Array<Object|null>>} merged profile per actor, `null` where
 *   no profile was found or scraping failed
 */
async function scrapeActors(actorNames) {
  return Promise.map(actorNames || argv.actors, async (actorName) => {
    try {
      const actorSlug = slugify(actorName);
      const actorEntry = await knex('actors').where({ slug: actorSlug }).first();
      const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
      const sourceSlugs = sources.flat();

      // ignore race-to-success grouping when scenes are requested
      const finalSources = argv.withReleases ? sourceSlugs : sources;

      const [siteEntries, networkEntries] = await Promise.all([
        knex('sites')
          .leftJoin('networks', 'sites.network_id', 'networks.id')
          .select(
            'sites.*',
            'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
          )
          .whereIn('sites.slug', sourceSlugs),
        knex('networks').select('*').whereIn('slug', sourceSlugs),
      ]);

      const sites = await curateSites(siteEntries, true);
      const networks = networkEntries.map(network => ({ ...network, isFallback: true }));
      // sites are merged last, so a site wins over a network with the same slug
      const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});

      const profiles = await scrapeProfiles(finalSources, actorName, actorEntry, sitesBySlug);
      const profile = await mergeProfiles(profiles, actorEntry);

      if (profile === null) {
        logger.warn(`Could not find profile for actor '${actorName}'`);

        if (argv.save && !actorEntry) {
          // keep a basic entry so the actor can be associated and scraped again later
          await storeActor({ name: actorName }, false, false);
        }

        return null;
      }

      if (argv.inspect) {
        console.log(profile);
        logger.info(`Found ${profile.releases.length} releases for ${actorName}`);
      }

      if (argv.save) {
        if (actorEntry) {
          await Promise.all([
            updateActor(profile, true, true),
            storeAvatars(profile.avatars, actorEntry.id),
          ]);

          return profile;
        }

        await storeActor(profile, true, true);
      }

      return profile;
    } catch (error) {
      console.log(error);
      logger.warn(`${actorName}: ${error}`);

      return null;
    }
  }, {
    concurrency: 3,
  });
}
|
||||
|
||||
/**
 * Scrape the profiles of all actors that have no `scraped_at` timestamp yet.
 *
 * @returns {Promise<Array<Object|null>>} result of `scrapeActors` for those actors
 */
async function scrapeBasicActors() {
  const basicActors = await knex('actors').where('scraped_at', null);

  return scrapeActors(basicActors.map(actor => actor.name));
}
|
||||
|
||||
/**
 * Associate actors with releases, storing actors that do not exist yet.
 *
 * @param {Object} mappedActors - actors keyed by slug, each with a `name` and
 *   an iterable `releaseIds`
 * @param {Object[]} releases - stored releases, used to look up existing associations
 * @returns {Promise<void>}
 */
async function associateActors(mappedActors, releases) {
  const [existingActorEntries, existingAssociationEntries] = await Promise.all([
    knex('actors')
      .whereIn('name', Object.values(mappedActors).map(actor => actor.name))
      .orWhereIn('slug', Object.keys(mappedActors)),
    knex('releases_actors').whereIn('release_id', releases.map(release => release.id)),
  ]);

  const associations = await Promise.map(Object.entries(mappedActors), async ([actorSlug, actor]) => {
    try {
      const actorEntry = existingActorEntries.find(actorX => actorX.slug === actorSlug)
        || await storeActor(actor);

      return Array.from(actor.releaseIds)
        .map(releaseId => ({
          release_id: releaseId,
          actor_id: actorEntry.id,
        }))
        // remove associations already in database
        .filter(association => !existingAssociationEntries
          .some(associationEntry => associationEntry.actor_id === association.actor_id
            && associationEntry.release_id === association.release_id));
    } catch (error) {
      // one failing actor must not prevent the others from being associated
      logger.error(actor.name, error);
      return null;
    }
  });

  const newAssociations = associations.filter(Boolean).flat();

  if (newAssociations.length === 0) {
    // nothing new to store, skip the empty insert
    return;
  }

  await knex('releases_actors').insert(newAssociations);

  // basic actor scraping is failure prone, don't run together with actor association
}
|
||||
|
||||
module.exports = {
|
||||
associateActors,
|
||||
fetchActors,
|
||||
scrapeActors,
|
||||
scrapeBasicActors,
|
||||
};
|
541
src/actors.js
|
@ -1,539 +1,26 @@
|
|||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
const UrlPattern = require('url-pattern');
|
||||
const moment = require('moment');
|
||||
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const argv = require('./argv');
|
||||
const include = require('./utils/argv-include')(argv);
|
||||
const scrapers = require('./scrapers/scrapers');
|
||||
const whereOr = require('./utils/where-or');
|
||||
const resolvePlace = require('./utils/resolve-place');
|
||||
const slugify = require('./utils/slugify');
|
||||
const capitalize = require('./utils/capitalize');
|
||||
const { curateSites } = require('./sites');
|
||||
const { storeMedia, associateMedia } = require('./media');
|
||||
|
||||
/**
 * Turn a raw `actors` row into the camelCased actor object handed to callers.
 *
 * Aliases, avatar, photos and social links are loaded with four parallel
 * queries keyed on `actor.id`. `origin` and `residence` are `null` unless at
 * least one of their city/state/country columns is set.
 *
 * @param {Object} actor - row from `actors`, optionally carrying the joined
 *   `birth_country_*` / `residence_country_*` columns selected by fetchActors
 * @returns {Promise<Object>} curated actor
 */
async function curateActor(actor) {
  const [aliases, avatar, photos, social] = await Promise.all([
    knex('actors').where({ alias_for: actor.id }),
    knex('actors_avatars')
      .where('actor_id', actor.id)
      .join('media', 'media.id', 'actors_avatars.media_id')
      .first(),
    knex('actors_photos')
      .where('actor_id', actor.id)
      .join('media', 'media.id', 'actors_photos.media_id')
      .orderBy('index'),
    knex('actors_social')
      .where('actor_id', actor.id)
      .orderBy('platform', 'desc'),
  ]);

  const hasOrigin = actor.birth_city || actor.birth_state || actor.birth_country_alpha2;
  const hasResidence = actor.residence_city || actor.residence_state || actor.residence_country_alpha2;

  const curatedActor = {
    id: actor.id,
    gender: actor.gender,
    name: actor.name,
    description: actor.description,
    birthdate: actor.birthdate && new Date(actor.birthdate),
    country: actor.country_alpha2,
    origin: hasOrigin ? {} : null,
    residence: hasResidence ? {} : null,
    ethnicity: actor.ethnicity,
    height: actor.height,
    weight: actor.weight,
    bust: actor.bust,
    waist: actor.waist,
    hip: actor.hip,
    naturalBoobs: actor.natural_boobs,
    aliases: aliases.map(({ name }) => name),
    slug: actor.slug,
    avatar,
    photos,
    hasTattoos: actor.has_tattoos,
    hasPiercings: actor.has_piercings,
    tattoos: actor.tattoos,
    piercings: actor.piercings,
    social,
    scrapedAt: actor.scraped_at,
  };

  if (curatedActor.birthdate) {
    // age in whole years as of now
    curatedActor.age = moment().diff(curatedActor.birthdate, 'years');
  }

  // origin/residence are only dereferenced for fields that made them non-null above
  if (actor.birth_city) curatedActor.origin.city = actor.birth_city;
  if (actor.birth_state) curatedActor.origin.state = actor.birth_state;

  if (actor.birth_country_alpha2) {
    curatedActor.origin.country = {
      alpha2: actor.birth_country_alpha2,
      name: actor.birth_country_name,
      alias: actor.birth_country_alias,
    };
  }

  if (actor.residence_city) curatedActor.residence.city = actor.residence_city;
  if (actor.residence_state) curatedActor.residence.state = actor.residence_state;

  if (actor.residence_country_alpha2) {
    curatedActor.residence.country = {
      alpha2: actor.residence_country_alpha2,
      name: actor.residence_country_name,
      alias: actor.residence_country_alias,
    };
  }

  return curatedActor;
}
|
||||
|
||||
/**
 * Curate a list of actor rows in parallel.
 *
 * @param {Object[]} releases - actor rows (the parameter name is historical)
 * @returns {Promise<Object[]>} curated actors, in input order
 */
function curateActors(releases) {
  return Promise.all(releases.map(actor => curateActor(actor)));
}
|
||||
|
||||
/**
 * Map a scraped/merged actor profile onto the snake_cased columns of `actors`.
 *
 * `id`, the birth/residence columns and the scrape bookkeeping columns are
 * only set when the corresponding input is present, so they are left out of
 * the insert/update otherwise.
 *
 * @param {Object} actor - profile with camelCased fields
 * @param {boolean} scraped - whether a scrape was attempted; stamps `scraped_at`
 * @param {boolean} scrapeSuccess - stored as `scrape_success` when `scraped`
 * @returns {Object} row for the `actors` table
 */
function curateActorEntry(actor, scraped, scrapeSuccess) {
  const curatedActor = {
    name: capitalize(actor.name),
    slug: slugify(actor.name),
    birthdate: actor.birthdate,
    description: actor.description,
    gender: actor.gender,
    ethnicity: actor.ethnicity,
    bust: actor.bust,
    waist: actor.waist,
    hip: actor.hip,
    natural_boobs: actor.naturalBoobs,
    height: actor.height,
    weight: actor.weight,
    hair: actor.hair,
    eyes: actor.eyes,
    has_tattoos: actor.hasTattoos,
    has_piercings: actor.hasPiercings,
    tattoos: actor.tattoos,
    piercings: actor.piercings,
  };

  if (actor.id) {
    curatedActor.id = actor.id;
  }

  if (actor.birthPlace) {
    curatedActor.birth_city = actor.birthPlace.city;
    curatedActor.birth_state = actor.birthPlace.state;
    curatedActor.birth_country_alpha2 = actor.birthPlace.country;
  }

  if (actor.residencePlace) {
    curatedActor.residence_city = actor.residencePlace.city;
    curatedActor.residence_state = actor.residencePlace.state;
    curatedActor.residence_country_alpha2 = actor.residencePlace.country;
  }

  if (scraped) {
    curatedActor.scraped_at = new Date();
    curatedActor.scrape_success = scrapeSuccess;
  }

  return curatedActor;
}
|
||||
|
||||
function curateSocialEntry(url, actorId) {
|
||||
const platforms = [
|
||||
// links supplied by PH often look like domain.com/domain.com/username
|
||||
{
|
||||
label: 'twitter',
|
||||
pattern: 'http(s)\\://(*)twitter.com/:username(/)(?*)',
|
||||
format: username => `https://www.twitter.com/${username}`,
|
||||
},
|
||||
{
|
||||
label: 'youtube',
|
||||
pattern: 'http(s)\\://(*)youtube.com/channel/:username(?*)',
|
||||
format: username => `https://www.youtube.com/channel/${username}`,
|
||||
},
|
||||
{
|
||||
label: 'instagram',
|
||||
pattern: 'http(s)\\://(*)instagram.com/:username(/)(?*)',
|
||||
format: username => `https://www.instagram.com/${username}`,
|
||||
},
|
||||
{
|
||||
label: 'snapchat',
|
||||
pattern: 'http(s)\\://(*)snapchat.com/add/:username(/)(?*)',
|
||||
format: username => `https://www.snapchat.com/add/${username}`,
|
||||
},
|
||||
{
|
||||
label: 'tumblr',
|
||||
pattern: 'http(s)\\://:username.tumblr.com(*)',
|
||||
format: username => `https://${username}.tumblr.com`,
|
||||
},
|
||||
{
|
||||
label: 'onlyfans',
|
||||
pattern: 'http(s)\\://(*)onlyfans.com/:username(/)(?*)',
|
||||
format: username => `https://www.onlyfans.com/${username}`,
|
||||
},
|
||||
{
|
||||
label: 'fancentro',
|
||||
pattern: 'http(s)\\://(*)fancentro.com/:username(/)(?*)',
|
||||
format: username => `https://www.fancentro.com/${username}`,
|
||||
},
|
||||
{
|
||||
label: 'modelhub',
|
||||
pattern: 'http(s)\\://(*)modelhub.com/:username(/)(?*)',
|
||||
format: username => `https://www.modelhub.com/${username}`,
|
||||
},
|
||||
];
|
||||
|
||||
const match = platforms.reduce((acc, platform) => {
|
||||
if (acc) return acc;
|
||||
|
||||
const patternMatch = new UrlPattern(platform.pattern).match(url);
|
||||
|
||||
if (patternMatch) {
|
||||
return {
|
||||
platform: platform.label,
|
||||
original: url,
|
||||
username: patternMatch.username,
|
||||
url: platform.format ? platform.format(patternMatch.username) : url,
|
||||
};
|
||||
}
|
||||
|
||||
return null;
|
||||
}, null) || { url };
|
||||
async function storeReleaseActors(releases) {
|
||||
const releaseIdsByActor = releases.reduce(
|
||||
(acc, release) => release.actors.reduce((actorAcc, actor) => {
|
||||
const releaseActor = actor.name ? actor : { name: actor };
|
||||
const actorSlug = slugify(releaseActor.name);
|
||||
|
||||
return {
|
||||
url: match.url,
|
||||
platform: match.platform,
|
||||
actor_id: actorId,
|
||||
...actorAcc,
|
||||
[actorSlug]: actorAcc[actorSlug]
|
||||
? actorAcc[actorSlug].concat(release.id)
|
||||
: [release.id],
|
||||
};
|
||||
}
|
||||
}, acc),
|
||||
{},
|
||||
);
|
||||
|
||||
/**
 * Curate a list of social URLs for an actor, dropping duplicates.
 *
 * A URL is a duplicate when its curated form equals, case-insensitively, a
 * link already stored for the actor or one seen earlier in `urls`.
 *
 * @param {string[]} [urls] - scraped social links
 * @param {number} actorId - actor the links belong to
 * @returns {Promise<Object[]>} new `actors_social` rows; empty when `urls` is falsy
 */
async function curateSocialEntries(urls, actorId) {
  if (!urls) {
    return [];
  }

  const existingSocialLinks = await knex('actors_social').where('actor_id', actorId);

  // one lookup set instead of rescanning both lists for every URL
  const seenUrls = new Set(existingSocialLinks.map(entry => entry.url.toLowerCase()));
  const socialEntries = [];

  for (const url of urls) {
    const socialEntry = curateSocialEntry(url, actorId);
    const urlKey = socialEntry.url.toLowerCase();

    if (!seenUrls.has(urlKey)) {
      seenUrls.add(urlKey);
      socialEntries.push(socialEntry);
    }
  }

  return socialEntries;
}
|
||||
|
||||
/**
 * Fetch and curate actors matching a query object.
 *
 * Rows are joined with `countries` twice (birth and residence) so the curated
 * actor can carry country names and aliases, ordered by name then gender.
 *
 * @param {Object} queryObject - filter passed to the `whereOr` helper
 * @param {number} [limit=100] - maximum number of actors returned
 * @returns {Promise<Object[]>} curated actors
 */
async function fetchActors(queryObject, limit = 100) {
  const actorEntries = await knex('actors')
    .select(
      'actors.*',
      'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name', 'birth_countries.alias as birth_country_alias',
      'residence_countries.alpha2 as residence_country_alpha2', 'residence_countries.name as residence_country_name', 'residence_countries.alias as residence_country_alias',
    )
    .leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
    .leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2')
    .orderBy(['actors.name', 'actors.gender'])
    .where(builder => whereOr(queryObject, 'actors', builder))
    .limit(limit);

  return curateActors(actorEntries);
}
|
||||
|
||||
/**
 * Store the social links of an actor that are not in the database yet.
 *
 * @param {string[]} [urls] - scraped social links
 * @param {number} actorId - actor the links belong to
 * @returns {Promise<void>}
 */
async function storeSocialLinks(urls, actorId) {
  const curatedSocialEntries = await curateSocialEntries(urls, actorId);

  if (curatedSocialEntries.length === 0) {
    // nothing new to store, skip the empty insert
    return;
  }

  await knex('actors_social').insert(curatedSocialEntries);
}
|
||||
|
||||
/**
 * Store an actor's avatars as media and associate them with the actor.
 *
 * @param {Array} [avatars] - avatar sources, passed through to `storeMedia`
 * @param {number} actorId - actor the avatars belong to
 * @returns {Promise<*>} the value returned by `storeMedia`, or an empty array
 *   when there are no avatars
 */
async function storeAvatars(avatars, actorId) {
  if (!avatars || avatars.length === 0) {
    return [];
  }

  const avatarsBySource = await storeMedia(avatars, 'actor', 'avatar');

  await associateMedia({ [actorId]: avatars }, avatarsBySource, 'actor', 'photo', 'avatar');

  return avatarsBySource;
}
|
||||
|
||||
/**
 * Insert a new actor, then store its social links and, when present, its avatars.
 *
 * @param {Object} actor - actor or merged profile to store
 * @param {boolean} [scraped=false] - passed through to curateActorEntry
 * @param {boolean} [scrapeSuccess=false] - passed through to curateActorEntry
 * @returns {Promise<Object>} the inserted actor row
 */
async function storeActor(actor, scraped = false, scrapeSuccess = false) {
	const actorRow = curateActorEntry(actor, scraped, scrapeSuccess);
	const [storedActor] = await knex('actors').insert(actorRow).returning('*');
	const { id: actorId } = storedActor;

	await storeSocialLinks(actor.social, actorId);

	if (actor.avatars) {
		await storeAvatars(actor.avatars, actorId);
	}

	logger.info(`Added new entry for actor '${actor.name}'`);

	return storedActor;
}
|
||||
|
||||
/**
 * Update the stored row of an existing actor and store any new social links.
 *
 * @param {Object} actor - actor or merged profile; `actor.id` selects the row to update
 * @param {boolean} [scraped=false] - passed through to curateActorEntry
 * @param {boolean} [scrapeSuccess=false] - passed through to curateActorEntry
 * @returns {Promise<Object>} the updated actor row
 */
async function updateActor(actor, scraped = false, scrapeSuccess = false) {
	const actorRow = curateActorEntry(actor, scraped, scrapeSuccess);
	const actorQuery = knex('actors').where({ id: actor.id });
	const [updatedActor] = await actorQuery.update(actorRow).returning('*');

	await storeSocialLinks(actor.social, actor.id);
	logger.info(`Updated entry for actor '${actor.name}'`);

	return updatedActor;
}
|
||||
|
||||
/**
 * Merge the profiles scraped from several sources into one actor profile.
 * For scalar fields the first profile providing a value wins; social links,
 * releases and avatars are accumulated across all profiles.
 *
 * @param {Array<Object|null|undefined>} profiles - scraped profiles, falsy for sources without a result
 * @param {Object} [actor] - existing actor entry; when given, its id and name are used
 * @returns {Promise<Object|null>} merged profile, or null when no profile is available
 */
async function mergeProfiles(profiles, actor) {
	// sources without a result yield a falsy entry, drop those up front
	const availableProfiles = profiles.filter(Boolean);

	if (availableProfiles.length === 0) {
		return null;
	}

	// normalize one scraped avatar (plain URL or object with src and optional copyright)
	const curateAvatar = (avatar, profile) => ({
		src: avatar.src || avatar,
		scraper: profile.scraper,
		// an explicitly provided copyright (including null) is kept, otherwise derive it from the source
		copyright: avatar.copyright === undefined ? capitalize(profile.site?.name || profile.scraper) : avatar.copyright,
	});

	const mergedProfile = availableProfiles.reduce((prevProfile, profile) => {
		const accProfile = {
			id: actor ? actor.id : null,
			name: actor ? actor.name : (prevProfile.name || profile.name),
			description: prevProfile.description || profile.description,
			gender: prevProfile.gender || profile.gender,
			birthdate: !prevProfile.birthdate || Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate,
			birthPlace: prevProfile.birthPlace || profile.birthPlace,
			residencePlace: prevProfile.residencePlace || profile.residencePlace,
			nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available
			ethnicity: prevProfile.ethnicity || profile.ethnicity,
			bust: prevProfile.bust || (/\d+\w+/.test(profile.bust) ? profile.bust : null),
			waist: prevProfile.waist || profile.waist,
			hip: prevProfile.hip || profile.hip,
			naturalBoobs: prevProfile.naturalBoobs === undefined ? profile.naturalBoobs : prevProfile.naturalBoobs,
			height: prevProfile.height || profile.height,
			weight: prevProfile.weight || profile.weight,
			hair: prevProfile.hair || profile.hair,
			eyes: prevProfile.eyes || profile.eyes,
			hasPiercings: prevProfile.hasPiercings === undefined ? profile.hasPiercings : prevProfile.hasPiercings,
			hasTattoos: prevProfile.hasTattoos === undefined ? profile.hasTattoos : prevProfile.hasTattoos,
			piercings: prevProfile.piercings || profile.piercings,
			tattoos: prevProfile.tattoos || profile.tattoos,
			social: prevProfile.social.concat(profile.social || []),
			releases: prevProfile.releases.concat(profile.releases ? profile.releases : []), // don't flatten fallbacks
		};

		if (profile.avatar) {
			const avatar = Array.isArray(profile.avatar)
				? profile.avatar.map(avatarX => curateAvatar(avatarX, profile))
				: curateAvatar(profile.avatar, profile);

			accProfile.avatars = prevProfile.avatars.concat([avatar]); // don't flatten fallbacks
		} else {
			accProfile.avatars = prevProfile.avatars;
		}

		return accProfile;
	}, {
		social: [],
		avatars: [],
		releases: [],
	});

	const [birthPlace, residencePlace] = await Promise.all([
		resolvePlace(mergedProfile.birthPlace),
		resolvePlace(mergedProfile.residencePlace),
	]);

	mergedProfile.birthPlace = birthPlace;
	mergedProfile.residencePlace = residencePlace;

	if (!mergedProfile.birthPlace && mergedProfile.nationality) {
		const country = await knex('countries')
			.where('nationality', 'ilike', `%${mergedProfile.nationality}%`)
			.orderBy('priority', 'desc')
			.first();

		// the nationality may not match any known country, leave the birth place unresolved in that case
		if (country) {
			mergedProfile.birthPlace = {
				country: country.alpha2,
			};
		}
	}

	return mergedProfile;
}
|
||||
|
||||
/**
 * Scrape an actor profile from every source. A source is either one scraper
 * slug or a list of slugs that are tried in order until one yields a profile.
 *
 * @param {Array<string|string[]>} sources - scraper slugs, optionally grouped into fallback lists
 * @param {string} actorName - name of the actor to look up
 * @param {Object|null|undefined} actorEntry - existing actor entry; its name is searched for when present
 * @param {Object} sitesBySlug - sites and networks keyed by slug
 * @returns {Promise<Array<Object|null>>} one profile per source, null where none was found
 */
async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) {
	return Promise.map(sources, async (source) => {
		const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] }));

		try {
			// the chain starts out rejected; each scraper only runs when the one before it failed
			return await profileScrapers.reduce(async (outcome, { scraper, scraperSlug }) => outcome.catch(async () => {
				if (!scraper) {
					logger.warn(`No profile scraper available for ${scraperSlug}`);
					throw new Error(`No profile scraper available for ${scraperSlug}`);
				}

				logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);

				const site = sitesBySlug[scraperSlug] || null;
				// NOTE(review): `include` is not defined inside this function; presumably module-level, confirm
				const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, include);

				if (profile) {
					logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);

					return {
						...profile,
						name: actorName,
						scraper: scraperSlug,
						site,
						// releases may be plain URLs or objects, make sure each carries a site
						releases: profile.releases?.map(release => (typeof release === 'string'
							? { url: release, site }
							: { ...release, site: release.site || site }
						)),
					};
				}

				logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}`);
				// a missing profile is expected, flag the error so it is not reported as a warning
				throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false });
			}), Promise.reject(new Error()));
		} catch (error) {
			if (error.warn !== false) {
				logger.warn(`Error in scraper ${source}: ${error.message}`);
			}
		}

		return null;
	});
}
|
||||
|
||||
/**
 * Scrape, merge and optionally store profiles for the given actors.
 *
 * @param {string[]} [actorNames] - names to scrape, falls back to argv.actors
 * @returns {Promise<Array<Object|null>>} merged profile per actor, null when none was found or scraping failed
 */
async function scrapeActors(actorNames) {
	return Promise.map(actorNames || argv.actors, async (actorName) => {
		try {
			const actorSlug = slugify(actorName);
			const actorEntry = await knex('actors').where({ slug: actorSlug }).first();
			const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);

			const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested
			const sourceSlugs = finalSources.flat();

			const [siteEntries, networkEntries] = await Promise.all([
				knex('sites')
					.leftJoin('networks', 'sites.network_id', 'networks.id')
					.select(
						'sites.*',
						'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
					)
					.whereIn('sites.slug', sourceSlugs),
				knex('networks').select('*').whereIn('slug', sourceSlugs),
			]);

			const sites = await curateSites(siteEntries, true);
			const networks = networkEntries.map(network => ({ ...network, isFallback: true }));
			const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});

			// pass the final sources, so the grouping is actually ignored when releases are requested
			const profiles = await scrapeProfiles(finalSources, actorName, actorEntry, sitesBySlug);
			const profile = await mergeProfiles(profiles, actorEntry);

			if (profile === null) {
				logger.warn(`Could not find profile for actor '${actorName}'`);

				if (argv.save && !actorEntry) {
					// keep track of the actor even though no profile was found
					await storeActor({ name: actorName }, false, false);
				}

				return null;
			}

			if (argv.inspect) {
				console.log(profile);
				logger.info(`Found ${profile.releases.length} releases for ${actorName}`);
			}

			if (argv.save) {
				if (actorEntry) {
					await Promise.all([
						updateActor(profile, true, true),
						storeAvatars(profile.avatars, actorEntry.id),
					]);

					return profile;
				}

				await storeActor(profile, true, true);
			}

			return profile;
		} catch (error) {
			console.log(error);
			logger.warn(`${actorName}: ${error}`);

			return null;
		}
	}, {
		concurrency: 3,
	});
}
|
||||
|
||||
/**
 * Scrape profiles for every stored actor whose `scraped_at` is null.
 *
 * @returns {Promise<*>} result of scrapeActors for those actors' names
 */
async function scrapeBasicActors() {
	const unscrapedActors = await knex('actors').where('scraped_at', null);
	const actorNames = unscrapedActors.map(({ name }) => name);

	return scrapeActors(actorNames);
}
|
||||
|
||||
/**
 * Associate actors with their releases, storing actors that do not exist yet
 * and skipping associations that are already in the database.
 *
 * @param {Object} mappedActors - actors keyed by slug, each with a `name` and iterable `releaseIds`
 * @param {Object[]} releases - stored releases, used to look up existing associations by `id`
 * @returns {Promise<void>}
 */
async function associateActors(mappedActors, releases) {
	const [existingActorEntries, existingAssociationEntries] = await Promise.all([
		knex('actors')
			.whereIn('name', Object.values(mappedActors).map(actor => actor.name))
			.orWhereIn('slug', Object.keys(mappedActors)),
		knex('releases_actors').whereIn('release_id', releases.map(release => release.id)),
	]);

	const associations = await Promise.map(Object.entries(mappedActors), async ([actorSlug, actor]) => {
		try {
			const actorEntry = existingActorEntries.find(actorX => actorX.slug === actorSlug)
				|| await storeActor(actor);

			return Array.from(actor.releaseIds)
				.map(releaseId => ({
					release_id: releaseId,
					actor_id: actorEntry.id,
				}))
				.filter(association => !existingAssociationEntries
					// remove associations already in database
					.some(associationEntry => associationEntry.actor_id === association.actor_id
						&& associationEntry.release_id === association.release_id));
		} catch (error) {
			// a single failing actor should not prevent the other associations from being stored
			logger.error(actor.name, error);
			return null;
		}
	});

	const newAssociations = associations.filter(association => association).flat();

	if (newAssociations.length > 0) {
		await knex('releases_actors').insert(newAssociations);
	}

	// basic actor scraping is failure prone, don't run together with actor association
}
|
||||
|
||||
module.exports = {
|
||||
associateActors,
|
||||
fetchActors,
|
||||
scrapeActors,
|
||||
scrapeBasicActors,
|
||||
storeReleaseActors,
|
||||
};
|
||||
|
|
15
src/app.js
|
@ -5,7 +5,10 @@ const argv = require('./argv');
|
|||
const initServer = require('./web/server');
|
||||
|
||||
const knex = require('./knex');
|
||||
const fetchUpdates = require('./fetch-updates');
|
||||
const fetchUpdates = require('./updates');
|
||||
const fetchDeep = require('./deep');
|
||||
const { storeReleases } = require('./store-releases');
|
||||
// const { storeReleaseActors } = require('./actors');
|
||||
|
||||
async function init() {
|
||||
if (argv.server) {
|
||||
|
@ -13,7 +16,15 @@ async function init() {
|
|||
return;
|
||||
}
|
||||
|
||||
await fetchUpdates();
|
||||
const updateBaseReleases = (argv.scrape || argv.sites || argv.networks) && await fetchUpdates();
|
||||
|
||||
const updateDeepReleases = updateBaseReleases && await fetchDeep(updateBaseReleases);
|
||||
const argvDeepReleases = argv.scenes && await fetchDeep(argv.scenes);
|
||||
|
||||
await storeReleases([...(updateDeepReleases || []), ...(argvDeepReleases || [])]);
|
||||
|
||||
// await storeReleaseActors(updateReleases);
|
||||
|
||||
knex.destroy();
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,145 @@
|
|||
'use strict';
|
||||
|
||||
const argv = require('./argv');
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const scrapers = require('./scrapers/scrapers');
|
||||
const { curateSites } = require('./sites');
|
||||
const { curateNetworks } = require('./networks');
|
||||
|
||||
/**
 * Derive a site slug from a URL by taking the hostname label directly in
 * front of the final dot-separated part (e.g. `www.example.com` gives `example`).
 *
 * @param {string} url - release URL
 * @returns {string|null|undefined} the slug, undefined when the hostname has no such label,
 *   or null when the URL cannot be parsed
 */
function urlToSiteSlug(url) {
	let hostname;

	try {
		({ hostname } = new URL(url));
	} catch (error) {
		logger.warn(`Failed to derive site slug from '${url}': ${error.message}`);

		return null;
	}

	const domainMatch = hostname.match(/([\w-]+)\.\w+$/);

	return domainMatch?.[1];
}
|
||||
|
||||
/**
 * Look up the sites and networks for base releases that have a URL but no
 * site attached, based on the slug derived from each URL.
 *
 * @param {Object[]} baseReleases - base releases to find sites for
 * @returns {Promise<Object>} curated sites and networks keyed by slug; networks are
 *   marked with `isFallback: true` and take precedence over a site with the same slug
 */
async function findSites(baseReleases) {
	const siteSlugs = Array.from(new Set(
		baseReleases
			.filter(release => release.url && !release.site)
			.map(baseRelease => urlToSiteSlug(baseRelease.url))
			.filter(Boolean),
	));

	// the site and network lookups are independent, run them side by side
	const [siteEntries, networkEntries] = await Promise.all([
		knex('sites').whereIn('slug', siteSlugs),
		knex('networks').whereIn('slug', siteSlugs),
	]);

	const [sites, networks] = await Promise.all([
		curateSites(siteEntries, true, false),
		curateNetworks(networkEntries, true, false, false),
	]);

	const markedNetworks = networks.map(network => ({ ...network, isFallback: true }));

	// later entries win, so networks are listed after sites
	return Object.fromEntries([]
		.concat(sites, markedNetworks)
		.map(site => [site.slug, site]));
}
|
||||
|
||||
/**
 * Normalize a mixed list of base releases and release URLs into base release
 * objects marked as not deep-scraped yet. Malformed entries are discarded with a warning.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - base release objects and/or release URLs
 * @returns {Object[]} base releases with `deep: false`
 */
function toBaseReleases(baseReleasesOrUrls) {
	return baseReleasesOrUrls
		.map((baseReleaseOrUrl) => {
			// optional chaining keeps null and undefined entries from throwing before they can be discarded
			const isBaseRelease = Boolean(baseReleaseOrUrl?.url)
				|| (typeof baseReleaseOrUrl === 'object' && baseReleaseOrUrl !== null && !Array.isArray(baseReleaseOrUrl));

			if (isBaseRelease) {
				// base release with or without URL, the latter is passed through as is
				return {
					...baseReleaseOrUrl,
					deep: false,
				};
			}

			if (typeof baseReleaseOrUrl === 'string' && /^http/.test(baseReleaseOrUrl)) {
				// URL
				return {
					url: baseReleaseOrUrl,
					deep: false,
				};
			}

			logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`);
			return null;
		})
		.filter(Boolean);
}
|
||||
|
||||
/**
 * Deep scrape a single base release using the scraper of its site.
 * The base release is returned as is when no site or scraper is available,
 * when the scraper does not support the release type, or when scraping fails.
 *
 * @param {Object} baseRelease - base release, ideally with a `url` and/or `site`
 * @param {Object} sites - sites and networks keyed by slug, used when the release has no site
 * @param {string} [type='scene'] - 'scene' or 'movie'
 * @returns {Promise<Object>} base release merged with the scraped details
 */
async function scrapeRelease(baseRelease, sites, type = 'scene') {
	const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)];

	if (!site) {
		logger.warn(`No site available for ${baseRelease.url}`);
		return baseRelease;
	}

	if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
		// nothing to fetch, or deep scraping is disabled; only attach the site
		return {
			...baseRelease,
			site,
		};
	}

	const scraper = scrapers.releases[site.slug];

	if (!scraper) {
		logger.warn(`Could not find scraper for ${baseRelease.url}`);
		return baseRelease;
	}

	if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
		logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
		return baseRelease;
	}

	try {
		const scrapedRelease = type === 'scene'
			? await scraper.fetchScene(baseRelease.url, site, baseRelease)
			: await scraper.fetchMovie(baseRelease.url, site, baseRelease);

		const mergedRelease = {
			...baseRelease,
			...scrapedRelease,
			deep: !!scrapedRelease,
			site,
		};

		if (scrapedRelease && baseRelease.tags) {
			// keep the base tags; the scraper may not return any tags of its own
			mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags || []);
		}

		return mergedRelease;
	} catch (error) {
		logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);
		return baseRelease;
	}
}
|
||||
|
||||
/**
 * Deep scrape all base releases concurrently.
 *
 * @param {Object[]} baseReleases - base releases to scrape
 * @param {Object} sites - sites and networks keyed by slug
 * @returns {Promise<Object[]>} scraped releases, in the order of the input
 */
async function scrapeReleases(baseReleases, sites) {
	const pendingReleases = [];

	for (const baseRelease of baseReleases) {
		pendingReleases.push(scrapeRelease(baseRelease, sites));
	}

	return Promise.all(pendingReleases);
}
|
||||
|
||||
/**
 * Normalize the given base releases or URLs, find their sites and deep scrape them.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - base release objects and/or release URLs
 * @returns {Promise<Object[]>} the value resolved by scrapeReleases
 */
async function fetchReleases(baseReleasesOrUrls) {
	const normalizedReleases = toBaseReleases(baseReleasesOrUrls);
	const sitesBySlug = await findSites(normalizedReleases);

	return scrapeReleases(normalizedReleases, sitesBySlug);
}
|
||||
|
||||
module.exports = fetchReleases;
|
|
@ -4,29 +4,33 @@ const knex = require('./knex');
|
|||
const whereOr = require('./utils/where-or');
|
||||
const { fetchSites } = require('./sites');
|
||||
|
||||
async function curateNetwork(network, includeParameters = false) {
|
||||
const [sites, studios] = await Promise.all([
|
||||
fetchSites({ network_id: network.id }),
|
||||
knex('studios')
|
||||
.where({ network_id: network.id }),
|
||||
]);
|
||||
|
||||
return {
|
||||
async function curateNetwork(network, includeParameters = false, includeSites = true, includeStudios = false) {
|
||||
const curatedNetwork = {
|
||||
id: network.id,
|
||||
name: network.name,
|
||||
url: network.url,
|
||||
description: network.description,
|
||||
slug: network.slug,
|
||||
sites,
|
||||
parameters: includeParameters ? network.parameters : null,
|
||||
studios: studios.map(studio => ({
|
||||
};
|
||||
|
||||
if (includeSites) {
|
||||
curatedNetwork.sites = await fetchSites({ network_id: network.id });
|
||||
}
|
||||
|
||||
if (includeStudios) {
|
||||
const studios = await knex('studios').where({ network_id: network.id });
|
||||
|
||||
curatedNetwork.studios = studios.map(studio => ({
|
||||
id: studio.id,
|
||||
name: studio.name,
|
||||
url: studio.url,
|
||||
description: studio.description,
|
||||
slug: studio.slug,
|
||||
})),
|
||||
};
|
||||
}));
|
||||
}
|
||||
|
||||
return curatedNetwork;
|
||||
}
|
||||
|
||||
function curateNetworks(releases) {
|
||||
|
@ -69,6 +73,8 @@ async function fetchNetworksFromReleases() {
|
|||
}
|
||||
|
||||
module.exports = {
|
||||
curateNetwork,
|
||||
curateNetworks,
|
||||
fetchNetworks,
|
||||
fetchNetworksFromReleases,
|
||||
findNetworkByUrl,
|
||||
|
|
|
@ -15,7 +15,7 @@ const {
|
|||
storeMedia,
|
||||
associateMedia,
|
||||
} = require('./media');
|
||||
const { fetchSites, findSiteByUrl } = require('./sites');
|
||||
const { fetchSites } = require('./sites');
|
||||
const slugify = require('./utils/slugify');
|
||||
const capitalize = require('./utils/capitalize');
|
||||
|
||||
|
@ -174,16 +174,7 @@ async function attachChannelSite(release) {
|
|||
};
|
||||
}
|
||||
|
||||
try {
|
||||
const urlSite = await findSiteByUrl(release.channel.url || release.channel);
|
||||
|
||||
return {
|
||||
...release,
|
||||
site: urlSite,
|
||||
};
|
||||
} catch (error) {
|
||||
throw new Error(`Unable to derive channel site from generic URL: ${release.url}`);
|
||||
}
|
||||
throw new Error(`Unable to match channel '${release.channel.slug || release.channel}' from generic URL: ${release.url}`);
|
||||
}
|
||||
|
||||
async function attachStudio(release) {
|
||||
|
|
|
@ -90,7 +90,7 @@ async function scrapeProfile({ qu }, site, withScenes) {
|
|||
|
||||
const bio = qu.all('.stats li', true).reduce((acc, row) => {
|
||||
const [key, value] = row.split(':');
|
||||
return { ...acc, [slugify(key, { delimiter: '_' })]: value.trim() };
|
||||
return { ...acc, [slugify(key, '_')]: value.trim() };
|
||||
}, {});
|
||||
|
||||
if (bio.height) profile.height = feetInchesToCm(bio.height);
|
||||
|
@ -133,7 +133,7 @@ async function fetchScene(url, site) {
|
|||
}
|
||||
|
||||
async function fetchProfile(actorName, scraperSlug, site, include) {
|
||||
const actorSlugA = slugify(actorName, { delimiter: '' });
|
||||
const actorSlugA = slugify(actorName, '');
|
||||
const actorSlugB = slugify(actorName);
|
||||
|
||||
const resA = await get(`${site.url}/models/${actorSlugA}.html`);
|
||||
|
|
|
@ -43,7 +43,7 @@ function scrapeAll(html, site, upcoming) {
|
|||
const poster = `https:${$(element).find('.card-main-img').attr('data-src')}`;
|
||||
const photos = $(element).find('.card-overlay .image-under').map((photoIndex, photoElement) => `https:${$(photoElement).attr('data-src')}`).toArray();
|
||||
|
||||
const channel = slugify($(element).find('.collection').attr('title'), { delimiter: '' });
|
||||
const channel = slugify($(element).find('.collection').attr('title'), '');
|
||||
|
||||
return acc.concat({
|
||||
url,
|
||||
|
|
|
@ -61,7 +61,7 @@ function scrapeProfile({ q, qa, qtx }) {
|
|||
|
||||
const keys = qa('.model-descr_line:not(.model-descr_rait) p.text span', true);
|
||||
const values = qa('.model-descr_line:not(.model-descr_rait) p.text').map(el => qtx(el));
|
||||
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, { delimiter: '_' })]: values[index] }), {});
|
||||
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {});
|
||||
|
||||
if (bio.height) profile.height = Number(bio.height.match(/\((\d+)cm\)/)[1]);
|
||||
if (bio.weight) profile.weight = Number(bio.weight.match(/\((\d+)kg\)/)[1]);
|
||||
|
@ -122,7 +122,7 @@ async function fetchScene(url, site, release) {
|
|||
|
||||
async function fetchProfile(actorName, scraperSlug) {
|
||||
const actorSlug = slugify(actorName);
|
||||
const actorSlug2 = slugify(actorName, { delimiter: '' });
|
||||
const actorSlug2 = slugify(actorName, '');
|
||||
|
||||
const [url, url2] = ['cherrypimps', 'wildoncam'].includes(scraperSlug)
|
||||
? [`https://${scraperSlug}.com/models/${actorSlug}.html`, `https://${scraperSlug}.com/models/${actorSlug2}.html`]
|
||||
|
|
|
@ -74,7 +74,7 @@ async function fetchActorReleases(urls) {
|
|||
async function scrapeProfile(html, _url, actorName) {
|
||||
const { qu } = ex(html);
|
||||
|
||||
const keys = qu.all('.about-title', true).map(key => slugify(key, { delimiter: '_' }));
|
||||
const keys = qu.all('.about-title', true).map(key => slugify(key, '_'));
|
||||
const values = qu.all('.about-info').map((el) => {
|
||||
if (el.children.length > 0) {
|
||||
return Array.from(el.children, child => child.textContent.trim()).join(', ');
|
||||
|
|
|
@ -79,7 +79,7 @@ async function fetchScene(url, site) {
|
|||
}
|
||||
|
||||
async function fetchProfile(actorName, scraperSlug) {
|
||||
const actorSlug = slugify(actorName, { delimiter: '' });
|
||||
const actorSlug = slugify(actorName, '');
|
||||
const url = scraperSlug === 'povperverts'
|
||||
? `https://povperverts.net/models/${actorSlug}.html`
|
||||
: `https://${scraperSlug}.com/models/${actorSlug}.html`;
|
||||
|
|
|
@ -233,7 +233,7 @@ async function scrapeScene(html, url, site, baseRelease, mobileHtml) {
|
|||
release.tags = hasTrans ? [...rawTags, 'transsexual'] : rawTags;
|
||||
|
||||
const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim() || $('.siteNameSpan').text()?.trim().toLowerCase().replace('.com', '');
|
||||
if (channel) release.channel = slugify(channel, { delimiter: '' });
|
||||
if (channel) release.channel = slugify(channel, '');
|
||||
|
||||
if (videoData.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/
|
||||
|
||||
|
|
|
@ -193,7 +193,7 @@ function scrapeSceneT1({ html, qu }, site, url, baseRelease, channelRegExp) {
|
|||
if (channel) {
|
||||
release.channel = {
|
||||
force: true,
|
||||
slug: slugify(channel, { delimiter: '' }),
|
||||
slug: slugify(channel, ''),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@ -239,7 +239,7 @@ function scrapeProfile({ el, qu }, site) {
|
|||
|
||||
return {
|
||||
...acc,
|
||||
[slugify(key, { delimiter: '_' })]: value.trim(),
|
||||
[slugify(key, '_')]: value.trim(),
|
||||
};
|
||||
}, {});
|
||||
|
||||
|
@ -272,7 +272,7 @@ function scrapeProfileT1({ el, qu }, site) {
|
|||
|
||||
return {
|
||||
...acc,
|
||||
[slugify(key, { delimiter: '_' })]: value.trim(),
|
||||
[slugify(key, '_')]: value.trim(),
|
||||
};
|
||||
}, {});
|
||||
|
||||
|
@ -308,7 +308,7 @@ function scrapeProfileTour({ el, qu }, site) {
|
|||
|
||||
return {
|
||||
...acc,
|
||||
[slugify(key, { delimiter: '_' })]: value.trim(),
|
||||
[slugify(key, '_')]: value.trim(),
|
||||
};
|
||||
}, {});
|
||||
|
||||
|
@ -382,7 +382,7 @@ async function fetchScene(url, site, baseRelease, beforeFetchLatest) {
|
|||
}
|
||||
|
||||
async function fetchProfile(actorName, scraperSlug, site) {
|
||||
const actorSlugA = slugify(actorName, { delimiter: '' });
|
||||
const actorSlugA = slugify(actorName, '');
|
||||
const actorSlugB = slugify(actorName);
|
||||
|
||||
const t1 = site.parameters?.t1 ? 't1/' : '';
|
||||
|
|
|
@ -384,8 +384,8 @@ async function fetchMovie(url, site) {
|
|||
}
|
||||
|
||||
async function fetchProfile(actorName) {
|
||||
const actorSlugA = slugify(actorName, { delimiter: '-' });
|
||||
const actorSlugB = slugify(actorName, { delimiter: '' });
|
||||
const actorSlugA = slugify(actorName, '-');
|
||||
const actorSlugB = slugify(actorName, '');
|
||||
|
||||
const urlA = `https://julesjordan.com/trial/models/${actorSlugA}.html`;
|
||||
const urlB = `https://julesjordan.com/trial/models/${actorSlugB}.html`;
|
||||
|
|
|
@ -98,7 +98,7 @@ function scrapeScene(data, url, _site, networkName) {
|
|||
}
|
||||
|
||||
const siteName = data.collections[0]?.name || data.brand;
|
||||
release.channel = slugify(siteName, { delimiter: '' });
|
||||
release.channel = slugify(siteName, '');
|
||||
|
||||
release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`;
|
||||
|
||||
|
|
|
@ -94,7 +94,7 @@ function scrapeProfile({ qu }, _actorName, origin) {
|
|||
const keys = qu.all('.model-profile h5', true);
|
||||
const values = qu.all('.model-profile h5 + p', true);
|
||||
|
||||
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, { delimiter: '_' })]: values[index] }), {});
|
||||
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {});
|
||||
|
||||
profile.age = Number(bio.age);
|
||||
profile.description = qu.q('.model-bio', true);
|
||||
|
|
|
@ -95,7 +95,7 @@ async function scrapeScene(html, url, site) {
|
|||
release.movie = $('a[data-track="FULL MOVIE"]').attr('href');
|
||||
|
||||
const siteElement = $('.content-wrapper .logos-sites a');
|
||||
if (siteElement) release.channel = slugify(siteElement.text(), { delimiter: '' });
|
||||
if (siteElement) release.channel = slugify(siteElement.text(), '');
|
||||
|
||||
return release;
|
||||
}
|
||||
|
@ -108,7 +108,7 @@ function scrapeProfile({ html, q, qa, qtx }) {
|
|||
const trimmedValue = value.trim();
|
||||
|
||||
if (trimmedValue.length === 0 || trimmedValue === '-') return acc;
|
||||
return { ...acc, [slugify(key, { delimiter: '_' })]: trimmedValue };
|
||||
return { ...acc, [slugify(key, '_')]: trimmedValue };
|
||||
}, {});
|
||||
|
||||
const description = q('.model-facts-long', true);
|
||||
|
@ -176,7 +176,7 @@ async function fetchScene(url, site) {
|
|||
}
|
||||
|
||||
async function fetchProfile(actorName) {
|
||||
const actorSearchSlug = slugify(actorName, { delimiter: '+' });
|
||||
const actorSearchSlug = slugify(actorName, '+');
|
||||
const url = `https://www.private.com/search.php?query=${actorSearchSlug}`;
|
||||
const modelRes = await geta(url, '.model h3 a');
|
||||
|
||||
|
|
|
@ -155,7 +155,7 @@ async function scrapeProfile(html, actorUrl, withReleases) {
|
|||
|
||||
const bio = qa('.stat').reduce((acc, el) => {
|
||||
const prop = q(el, '.label', true).slice(0, -1);
|
||||
const key = slugify(prop, { delimiter: '_' });
|
||||
const key = slugify(prop, '_');
|
||||
const value = q(el, '.value', true);
|
||||
|
||||
return {
|
||||
|
|
22
src/sites.js
|
@ -7,19 +7,13 @@ const argv = require('./argv');
|
|||
const knex = require('./knex');
|
||||
const whereOr = require('./utils/where-or');
|
||||
|
||||
async function curateSite(site, includeParameters = false) {
|
||||
const tags = await knex('sites_tags')
|
||||
.select('tags.*', 'sites_tags.inherit')
|
||||
.where('site_id', site.id)
|
||||
.join('tags', 'tags.id', 'sites_tags.tag_id');
|
||||
|
||||
return {
|
||||
async function curateSite(site, includeParameters = false, includeTags = true) {
|
||||
const curatedSite = {
|
||||
id: site.id,
|
||||
name: site.name,
|
||||
url: site.url,
|
||||
description: site.description,
|
||||
slug: site.slug,
|
||||
tags,
|
||||
independent: !!site.parameters && site.parameters.independent,
|
||||
parameters: includeParameters ? site.parameters : null,
|
||||
network: {
|
||||
|
@ -31,6 +25,15 @@ async function curateSite(site, includeParameters = false) {
|
|||
parameters: includeParameters ? site.network_parameters : null,
|
||||
},
|
||||
};
|
||||
|
||||
if (includeTags) {
|
||||
curatedSite.tags = await knex('sites_tags')
|
||||
.select('tags.*', 'sites_tags.inherit')
|
||||
.where('site_id', site.id)
|
||||
.join('tags', 'tags.id', 'sites_tags.tag_id');
|
||||
}
|
||||
|
||||
return curatedSite;
|
||||
}
|
||||
|
||||
function curateSites(sites, includeParameters) {
|
||||
|
@ -78,7 +81,7 @@ async function findSiteByUrl(url) {
|
|||
.first();
|
||||
|
||||
if (site) {
|
||||
const curatedSite = curateSite(site, true);
|
||||
const curatedSite = curateSite(site, true, false);
|
||||
|
||||
return curatedSite;
|
||||
}
|
||||
|
@ -182,6 +185,7 @@ async function fetchSitesFromReleases() {
|
|||
}
|
||||
|
||||
module.exports = {
|
||||
curateSite,
|
||||
curateSites,
|
||||
fetchIncludedSites,
|
||||
fetchSites,
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
|
||||
const knex = require('./knex');
|
||||
const slugify = require('./utils/slugify');
|
||||
|
||||
/**
 * Map a scraped release onto the column layout of the releases table.
 *
 * @param {Object} release - scraped release with a `site` attached
 * @param {number} batchId - ID of the batch storing the release
 * @param {Object} [existingRelease] - stored release, when the release is updated rather than added
 * @returns {Object} release entry; `created_batch_id` is only set when there is no existing release
 */
function curateReleaseEntry(release, batchId, existingRelease) {
	const slugOptions = {
		encode: true,
		limit: config.titleSlugLength,
	};
	const slug = slugify(release.title, '-', slugOptions);

	// only a release that was not stored before gets a creation batch
	const creationColumns = existingRelease ? {} : { created_batch_id: batchId };

	return {
		title: release.title,
		entry_id: release.entryId || null,
		site_id: release.site.id,
		shoot_id: release.shootId || null,
		studio_id: release.studio?.id || null,
		url: release.url,
		date: release.date,
		slug,
		description: release.description,
		duration: release.duration,
		type: release.type,
		// any non-boolean deep flag counts as not deep
		deep: release.deep === true,
		deep_url: release.deepUrl,
		updated_batch_id: batchId,
		...creationColumns,
	};
}
|
||||
|
||||
/**
 * Placeholder for attaching sites to releases: it currently only selects the
 * releases that have no site, or only a fallback site, and returns nothing.
 *
 * @param {Object[]} releases - releases to attach sites to
 * @returns {Promise<undefined>}
 */
async function attachSite(releases) {
	// NOTE(review): the selection is not used yet and nothing is returned; presumably unfinished, confirm
	const releasesMissingSite = releases.filter(({ site }) => !(site && !site.isFallback));
}
|
||||
|
||||
/**
 * Filter out releases that are already stored, identified by the combination
 * of their site ID and entry ID.
 *
 * @param {Object[]} releases - scraped releases with `entryId` and `site.id`
 * @returns {Promise<Object[]>} releases that are not in the database yet
 */
async function extractUniqueReleases(releases) {
	const toKey = (siteId, entryId) => `${siteId}_${entryId}`;

	const storedEntries = await knex('releases')
		.whereIn(['entry_id', 'site_id'], releases.map(({ entryId, site }) => [entryId, site.id]));

	const storedKeys = new Set();

	for (const storedEntry of storedEntries) {
		storedKeys.add(toKey(storedEntry.site_id, storedEntry.entry_id));
	}

	return releases.filter(({ entryId, site }) => !storedKeys.has(toKey(site.id, entryId)));
}
|
||||
|
||||
/**
 * Store scraped releases under a newly created batch, skipping releases
 * that are already in the database.
 *
 * @param {Object[]} releases - scraped releases with a site attached
 * @returns {Promise<void>}
 */
async function storeReleases(releases) {
	const [batchId] = await knex('batches')
		.insert({ comment: null })
		.returning('id');

	const newReleases = await extractUniqueReleases(releases);

	// the result is not used yet, attachSite does not return anything at this point
	await attachSite(releases);

	// NOTE(review): only the first two unique releases are stored; looks like a development limit, confirm
	const releaseEntries = [];

	for (const release of newReleases.slice(0, 2)) {
		releaseEntries.push(curateReleaseEntry(release, batchId));
	}

	await knex('releases').insert(releaseEntries);
}
|
||||
|
||||
module.exports = {
|
||||
storeReleases,
|
||||
};
|
|
@ -83,7 +83,10 @@ async function scrapeLatestReleases(scraper, site, preData) {
|
|||
const latestReleasesWithSite = latestReleases.map(release => ({ ...release, site: release.site || site })); // attach site release is assigned to when stored
|
||||
const oldestReleaseOnPage = latestReleases.sort((releaseA, releaseB) => releaseB.date - releaseA.date).slice(-1)[0];
|
||||
|
||||
const uniqueReleases = await extractUniqueReleases(latestReleasesWithSite, accReleases);
|
||||
const uniqueReleases = argv.redownload
|
||||
? latestReleasesWithSite
|
||||
: await extractUniqueReleases(latestReleasesWithSite, accReleases);
|
||||
|
||||
const pageAccReleases = accReleases.concat(uniqueReleases);
|
||||
|
||||
logger.verbose(`Scraped '${site.name}' (${site.network.name}) page ${page}, found ${uniqueReleases.length} unique releases`);
|
||||
|
@ -204,7 +207,9 @@ async function fetchUpdates() {
|
|||
{ concurrency: 5 },
|
||||
);
|
||||
|
||||
return scrapedNetworks;
|
||||
const releases = scrapedNetworks.flat(2);
|
||||
|
||||
return releases;
|
||||
}
|
||||
|
||||
module.exports = fetchUpdates;
|
|
@ -1,13 +1,14 @@
|
|||
'use strict';
|
||||
|
||||
function slugify(string, {
|
||||
function slugify(string, delimiter = '-', {
|
||||
encode = false,
|
||||
delimiter = '-',
|
||||
limit = 1000,
|
||||
} = {}) {
|
||||
const slugComponents = string.trim().toLowerCase().match(/\w+/g);
|
||||
|
||||
if (!slugComponents) return '';
|
||||
if (!slugComponents) {
|
||||
return '';
|
||||
}
|
||||
|
||||
const slug = slugComponents.reduce((acc, component, index) => {
|
||||
const accSlug = `${acc}${index > 0 ? delimiter : ''}${component}`;
|
||||
|
|