Refactoring deep scrape. Added tag posters.
After Width: | Height: | Size: 5.0 MiB |
After Width: | Height: | Size: 96 KiB |
After Width: | Height: | Size: 1.0 MiB |
After Width: | Height: | Size: 102 KiB |
After Width: | Height: | Size: 1.3 MiB |
After Width: | Height: | Size: 94 KiB |
Before Width: | Height: | Size: 101 KiB |
Before Width: | Height: | Size: 17 KiB |
|
@ -6,6 +6,7 @@ const tagPosters = [
|
||||||
['bdsm', 0, 'Dani Daniels in "The Traning of Dani Daniels, Day 2" for The Training of O at Kink'],
|
['bdsm', 0, 'Dani Daniels in "The Traning of Dani Daniels, Day 2" for The Training of O at Kink'],
|
||||||
['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'],
|
['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'],
|
||||||
['da-tp', 0, 'Natasha Teen in LegalPorno SZ2164'],
|
['da-tp', 0, 'Natasha Teen in LegalPorno SZ2164'],
|
||||||
|
['deepthroat', 0, 'Chanel Grey in "Deepthroating Is Fun" for Throated'],
|
||||||
['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'],
|
['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'],
|
||||||
['double-penetration', 'poster', 'Mia Malkova in "DP!" for HardX'],
|
['double-penetration', 'poster', 'Mia Malkova in "DP!" for HardX'],
|
||||||
['double-vaginal', 'poster', 'Riley Reid in "Pizza That Ass" for Reid My Lips'],
|
['double-vaginal', 'poster', 'Riley Reid in "Pizza That Ass" for Reid My Lips'],
|
||||||
|
@ -13,7 +14,7 @@ const tagPosters = [
|
||||||
['oral-creampie', 1, 'Keisha Grey in Brazzers House'],
|
['oral-creampie', 1, 'Keisha Grey in Brazzers House'],
|
||||||
['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'],
|
['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'],
|
||||||
['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'],
|
['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'],
|
||||||
['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'],
|
['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
|
||||||
['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'],
|
['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'],
|
||||||
['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
|
['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
|
||||||
['blowbang', 'poster'],
|
['blowbang', 'poster'],
|
||||||
|
@ -27,7 +28,7 @@ const tagPosters = [
|
||||||
['gaping', 1, 'Vina Sky in "Vina Sky Does Anal" for HardX'],
|
['gaping', 1, 'Vina Sky in "Vina Sky Does Anal" for HardX'],
|
||||||
['interracial', 'poster'],
|
['interracial', 'poster'],
|
||||||
['latina', 'poster'],
|
['latina', 'poster'],
|
||||||
['mff', 'poster'],
|
['mff', 0, 'Madison Ivy and Adriana Chechik in "Day With A Pornstar" for Brazzers'],
|
||||||
['mfm', 'poster'],
|
['mfm', 'poster'],
|
||||||
['orgy', 'poster'],
|
['orgy', 'poster'],
|
||||||
['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
|
['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
|
||||||
|
@ -47,6 +48,7 @@ const tagPosters = [
|
||||||
const tagPhotos = [
|
const tagPhotos = [
|
||||||
['airtight', 2, 'Dakota Skye in "Dakota Goes Nuts" for ArchAngel'],
|
['airtight', 2, 'Dakota Skye in "Dakota Goes Nuts" for ArchAngel'],
|
||||||
['airtight', 3, 'Anita Bellini in "Triple Dick Gangbang" for Hands On Hardcore (DDF Network)'],
|
['airtight', 3, 'Anita Bellini in "Triple Dick Gangbang" for Hands On Hardcore (DDF Network)'],
|
||||||
|
['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'],
|
||||||
['anal', 0],
|
['anal', 0],
|
||||||
['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'],
|
['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'],
|
||||||
['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'],
|
['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'],
|
||||||
|
|
|
@ -0,0 +1,539 @@
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
const config = require('config');
|
||||||
|
const Promise = require('bluebird');
|
||||||
|
const UrlPattern = require('url-pattern');
|
||||||
|
const moment = require('moment');
|
||||||
|
|
||||||
|
const logger = require('./logger')(__filename);
|
||||||
|
const knex = require('./knex');
|
||||||
|
const argv = require('./argv');
|
||||||
|
const include = require('./utils/argv-include')(argv);
|
||||||
|
const scrapers = require('./scrapers/scrapers');
|
||||||
|
const whereOr = require('./utils/where-or');
|
||||||
|
const resolvePlace = require('./utils/resolve-place');
|
||||||
|
const slugify = require('./utils/slugify');
|
||||||
|
const capitalize = require('./utils/capitalize');
|
||||||
|
const { curateSites } = require('./sites');
|
||||||
|
const { storeMedia, associateMedia } = require('./media');
|
||||||
|
|
||||||
|
/**
 * Turn a raw `actors` row into the camelCased actor object used by the rest
 * of the application, loading its aliases, avatar, photos and social links.
 *
 * @param {Object} actor - row from `actors`, optionally carrying the joined
 *   `birth_country_*` and `residence_country_*` columns
 * @returns {Promise<Object>} curated actor; `origin` and `residence` are null
 *   when no city, state or country is known
 */
async function curateActor(actor) {
	// Only the keys that are actually known end up on the place object.
	function curatePlace(city, state, country) {
		if (!city && !state && !country.alpha2) {
			return null;
		}

		const place = {};

		if (city) place.city = city;
		if (state) place.state = state;
		if (country.alpha2) place.country = country;

		return place;
	}

	const aliasQuery = knex('actors').where({ alias_for: actor.id });

	const avatarQuery = knex('actors_avatars')
		.where('actor_id', actor.id)
		.join('media', 'media.id', 'actors_avatars.media_id')
		.first();

	const photoQuery = knex('actors_photos')
		.where('actor_id', actor.id)
		.join('media', 'media.id', 'actors_photos.media_id')
		.orderBy('index');

	const socialQuery = knex('actors_social')
		.where('actor_id', actor.id)
		.orderBy('platform', 'desc');

	const [aliases, avatar, photos, social] = await Promise.all([aliasQuery, avatarQuery, photoQuery, socialQuery]);

	const origin = curatePlace(actor.birth_city, actor.birth_state, {
		alpha2: actor.birth_country_alpha2,
		name: actor.birth_country_name,
		alias: actor.birth_country_alias,
	});

	const residence = curatePlace(actor.residence_city, actor.residence_state, {
		alpha2: actor.residence_country_alpha2,
		name: actor.residence_country_name,
		alias: actor.residence_country_alias,
	});

	const curatedActor = {
		id: actor.id,
		gender: actor.gender,
		name: actor.name,
		description: actor.description,
		birthdate: actor.birthdate ? new Date(actor.birthdate) : actor.birthdate,
		country: actor.country_alpha2,
		origin,
		residence,
		ethnicity: actor.ethnicity,
		height: actor.height,
		weight: actor.weight,
		bust: actor.bust,
		waist: actor.waist,
		hip: actor.hip,
		naturalBoobs: actor.natural_boobs,
		aliases: aliases.map(alias => alias.name),
		slug: actor.slug,
		avatar,
		photos,
		hasTattoos: actor.has_tattoos,
		hasPiercings: actor.has_piercings,
		tattoos: actor.tattoos,
		piercings: actor.piercings,
		social,
		scrapedAt: actor.scraped_at,
	};

	// age is only defined when a birthdate is known
	if (curatedActor.birthdate) {
		curatedActor.age = moment().diff(curatedActor.birthdate, 'years');
	}

	return curatedActor;
}
|
||||||
|
|
||||||
|
/**
 * Curate a list of raw actor rows in parallel.
 *
 * @param {Object[]} releases - raw `actors` rows (the parameter name is historical)
 * @returns {Promise<Object[]>} curated actors, in input order
 */
function curateActors(releases) {
	const pendingActors = releases.map(actorRow => curateActor(actorRow));

	return Promise.all(pendingActors);
}
|
||||||
|
|
||||||
|
/**
 * Map a camelCased actor or profile onto the snake_cased column layout of the
 * `actors` table.
 *
 * @param {Object} actor - actor or merged profile; `name` is required
 * @param {boolean} scraped - when truthy, stamp the entry with the scrape time
 * @param {boolean} scrapeSuccess - stored as `scrape_success` when `scraped` is truthy
 * @returns {Object} row suitable for insert or update
 */
function curateActorEntry(actor, scraped, scrapeSuccess) {
	const { birthPlace, residencePlace } = actor;

	const entry = {
		name: capitalize(actor.name),
		slug: slugify(actor.name),
		birthdate: actor.birthdate,
		description: actor.description,
		gender: actor.gender,
		ethnicity: actor.ethnicity,
		bust: actor.bust,
		waist: actor.waist,
		hip: actor.hip,
		natural_boobs: actor.naturalBoobs,
		height: actor.height,
		weight: actor.weight,
		hair: actor.hair,
		eyes: actor.eyes,
		has_tattoos: actor.hasTattoos,
		has_piercings: actor.hasPiercings,
		tattoos: actor.tattoos,
		piercings: actor.piercings,
	};

	if (actor.id) {
		entry.id = actor.id;
	}

	// place columns are only written when the place itself is known
	if (birthPlace) {
		Object.assign(entry, {
			birth_city: birthPlace.city,
			birth_state: birthPlace.state,
			birth_country_alpha2: birthPlace.country,
		});
	}

	if (residencePlace) {
		Object.assign(entry, {
			residence_city: residencePlace.city,
			residence_state: residencePlace.state,
			residence_country_alpha2: residencePlace.country,
		});
	}

	if (scraped) {
		Object.assign(entry, {
			scraped_at: new Date(),
			scrape_success: scrapeSuccess,
		});
	}

	return entry;
}
|
||||||
|
|
||||||
|
/**
 * Normalize a social media URL into an `actors_social` row.
 *
 * The URL is matched against the known platforms in order; the first match
 * determines the platform label and the canonical URL. URLs that match no
 * platform are stored unchanged with an undefined platform.
 *
 * @param {string} url - social link as scraped
 * @param {number} actorId - actor the link belongs to
 * @returns {{url: string, platform: (string|undefined), actor_id: number}}
 */
function curateSocialEntry(url, actorId) {
	// links supplied by PH often look like domain.com/domain.com/username
	const platforms = [
		{
			label: 'twitter',
			pattern: 'http(s)\\://(*)twitter.com/:username(/)(?*)',
			format: username => `https://www.twitter.com/${username}`,
		},
		{
			label: 'youtube',
			pattern: 'http(s)\\://(*)youtube.com/channel/:username(?*)',
			format: username => `https://www.youtube.com/channel/${username}`,
		},
		{
			label: 'instagram',
			pattern: 'http(s)\\://(*)instagram.com/:username(/)(?*)',
			format: username => `https://www.instagram.com/${username}`,
		},
		{
			label: 'snapchat',
			pattern: 'http(s)\\://(*)snapchat.com/add/:username(/)(?*)',
			format: username => `https://www.snapchat.com/add/${username}`,
		},
		{
			label: 'tumblr',
			pattern: 'http(s)\\://:username.tumblr.com(*)',
			format: username => `https://${username}.tumblr.com`,
		},
		{
			label: 'onlyfans',
			pattern: 'http(s)\\://(*)onlyfans.com/:username(/)(?*)',
			format: username => `https://www.onlyfans.com/${username}`,
		},
		{
			label: 'fancentro',
			pattern: 'http(s)\\://(*)fancentro.com/:username(/)(?*)',
			format: username => `https://www.fancentro.com/${username}`,
		},
		{
			label: 'modelhub',
			pattern: 'http(s)\\://(*)modelhub.com/:username(/)(?*)',
			format: username => `https://www.modelhub.com/${username}`,
		},
	];

	// fallback for URLs that match none of the platforms
	let curatedUrl = url;
	let curatedPlatform;

	for (const { label, pattern, format } of platforms) {
		const patternMatch = new UrlPattern(pattern).match(url);

		if (patternMatch) {
			curatedPlatform = label;
			curatedUrl = format ? format(patternMatch.username) : url;

			break; // first matching platform wins
		}
	}

	return {
		url: curatedUrl,
		platform: curatedPlatform,
		actor_id: actorId,
	};
}
|
||||||
|
|
||||||
|
/**
 * Curate a list of social URLs for an actor, dropping links that are already
 * stored for that actor or that occur more than once in the list.
 *
 * Duplicates are detected case-insensitively on the normalized URL. Empty
 * entries in `urls` are skipped instead of crashing the curation.
 *
 * @param {string[]|null|undefined} urls - scraped social links
 * @param {number} actorId - actor the links belong to
 * @returns {Promise<Object[]>} new `actors_social` rows, in input order
 */
async function curateSocialEntries(urls, actorId) {
	if (!urls) {
		return [];
	}

	const existingSocialLinks = await knex('actors_social').where('actor_id', actorId);

	// a single set covers both stored links and links accepted earlier in this list
	const seenUrls = new Set(existingSocialLinks
		.filter(entry => entry.url)
		.map(entry => entry.url.toLowerCase()));

	return urls.filter(Boolean).reduce((acc, url) => {
		const socialEntry = curateSocialEntry(url, actorId);
		const urlKey = socialEntry.url.toLowerCase();

		if (seenUrls.has(urlKey)) {
			// prevent duplicates
			return acc;
		}

		seenUrls.add(urlKey);
		acc.push(socialEntry);

		return acc;
	}, []);
}
|
||||||
|
|
||||||
|
/**
 * Fetch and curate actors matching any of the given criteria.
 *
 * @param {Object} queryObject - criteria handed to `whereOr` for the `actors` table
 * @param {number} [limit=100] - maximum number of actors to return
 * @returns {Promise<Object[]>} curated actors ordered by name, then gender
 */
async function fetchActors(queryObject, limit = 100) {
	const birthCountryColumns = [
		'birth_countries.alpha2 as birth_country_alpha2',
		'birth_countries.name as birth_country_name',
		'birth_countries.alias as birth_country_alias',
	];

	const residenceCountryColumns = [
		'residence_countries.alpha2 as residence_country_alpha2',
		'residence_countries.name as residence_country_name',
		'residence_countries.alias as residence_country_alias',
	];

	// the countries table is joined twice, once per kind of place
	const actorRows = await knex('actors')
		.select('actors.*', ...birthCountryColumns, ...residenceCountryColumns)
		.leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
		.leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2')
		.orderBy(['actors.name', 'actors.gender'])
		.where(builder => whereOr(queryObject, 'actors', builder))
		.limit(limit);

	return curateActors(actorRows);
}
|
||||||
|
|
||||||
|
/**
 * Store the social links of an actor that are not in the database yet.
 *
 * @param {string[]|null|undefined} urls - scraped social links
 * @param {number} actorId - actor the links belong to
 * @returns {Promise<void>}
 */
async function storeSocialLinks(urls, actorId) {
	const curatedSocialEntries = await curateSocialEntries(urls, actorId);

	// nothing new to store; do not issue an insert without rows
	if (curatedSocialEntries.length === 0) {
		return;
	}

	await knex('actors_social').insert(curatedSocialEntries);
}
|
||||||
|
|
||||||
|
/**
 * Store avatar media and associate it with an actor.
 *
 * @param {Array} avatars - avatar sources; nothing happens when missing or empty
 * @param {number} actorId - actor the avatars belong to
 * @returns {Promise<Object|Array>} the result of `storeMedia`, or an empty
 *   array when there were no avatars
 */
async function storeAvatars(avatars, actorId) {
	const hasAvatars = Boolean(avatars) && avatars.length !== 0;

	if (!hasAvatars) {
		return [];
	}

	const avatarsBySource = await storeMedia(avatars, 'actor', 'avatar');
	const avatarsByActorId = { [actorId]: avatars };

	await associateMedia(avatarsByActorId, avatarsBySource, 'actor', 'photo', 'avatar');

	return avatarsBySource;
}
|
||||||
|
|
||||||
|
/**
 * Insert a new actor together with its social links and avatars.
 *
 * @param {Object} actor - actor or merged profile; `name` is required
 * @param {boolean} [scraped=false] - whether the data comes from a profile scrape
 * @param {boolean} [scrapeSuccess=false] - whether that scrape found a profile
 * @returns {Promise<Object>} the inserted `actors` row
 */
async function storeActor(actor, scraped = false, scrapeSuccess = false) {
	const insertedRows = await knex('actors')
		.insert(curateActorEntry(actor, scraped, scrapeSuccess))
		.returning('*');

	const [actorEntry] = insertedRows;
	const actorId = actorEntry.id;

	await storeSocialLinks(actor.social, actorId);

	if (actor.avatars) {
		await storeAvatars(actor.avatars, actorId);
	}

	logger.info(`Added new entry for actor '${actor.name}'`);

	return actorEntry;
}
|
||||||
|
|
||||||
|
/**
 * Update an existing actor row and store any new social links.
 *
 * @param {Object} actor - actor or merged profile; `id` selects the row to update
 * @param {boolean} [scraped=false] - whether the data comes from a profile scrape
 * @param {boolean} [scrapeSuccess=false] - whether that scrape found a profile
 * @returns {Promise<Object>} the updated `actors` row
 */
async function updateActor(actor, scraped = false, scrapeSuccess = false) {
	const updatedRows = await knex('actors')
		.where({ id: actor.id })
		.update(curateActorEntry(actor, scraped, scrapeSuccess))
		.returning('*');

	const [actorEntry] = updatedRows;

	await storeSocialLinks(actor.social, actor.id);

	logger.info(`Updated entry for actor '${actor.name}'`);

	return actorEntry;
}
|
||||||
|
|
||||||
|
/**
 * Merge the profiles scraped from several sources into a single profile.
 *
 * Profiles are merged in order and earlier profiles take precedence: a field
 * is only taken from a later profile when no earlier profile provided it.
 * Social links, releases and avatars are accumulated rather than overridden.
 * Missing profiles (null or undefined) are ignored.
 *
 * @param {Array<Object|null|undefined>} profiles - scraped profiles, most trusted first
 * @param {Object|null|undefined} actor - existing actor entry; when present its
 *   id and name are used for the merged profile
 * @returns {Promise<Object|null>} merged profile, or null when no profile was found
 */
async function mergeProfiles(profiles, actor) {
	const availableProfiles = profiles.filter(Boolean);

	if (availableProfiles.length === 0) {
		return null;
	}

	const mergedProfile = availableProfiles.reduce((prevProfile, profile) => {
		// An avatar is either a plain URL or an object with src and optional copyright.
		const curateAvatar = avatar => ({
			src: avatar.src || avatar,
			scraper: profile.scraper,
			// copyright is read per avatar; null is a deliberate value, only undefined falls back
			copyright: avatar.copyright === undefined
				? capitalize(profile.site?.name || profile.scraper)
				: avatar.copyright,
		});

		const accProfile = {
			id: actor ? actor.id : null,
			name: actor ? actor.name : (prevProfile.name || profile.name),
			description: prevProfile.description || profile.description,
			gender: prevProfile.gender || profile.gender,
			// an invalid date from an earlier profile must not block a valid one
			birthdate: !prevProfile.birthdate || Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate,
			birthPlace: prevProfile.birthPlace || profile.birthPlace,
			residencePlace: prevProfile.residencePlace || profile.residencePlace,
			nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available
			ethnicity: prevProfile.ethnicity || profile.ethnicity,
			bust: prevProfile.bust || (/\d+\w+/.test(profile.bust) ? profile.bust : null),
			waist: prevProfile.waist || profile.waist,
			hip: prevProfile.hip || profile.hip,
			// booleans: false is a valid value, only undefined falls through
			naturalBoobs: prevProfile.naturalBoobs === undefined ? profile.naturalBoobs : prevProfile.naturalBoobs,
			height: prevProfile.height || profile.height,
			weight: prevProfile.weight || profile.weight,
			hair: prevProfile.hair || profile.hair,
			eyes: prevProfile.eyes || profile.eyes,
			hasPiercings: prevProfile.hasPiercings === undefined ? profile.hasPiercings : prevProfile.hasPiercings,
			hasTattoos: prevProfile.hasTattoos === undefined ? profile.hasTattoos : prevProfile.hasTattoos,
			piercings: prevProfile.piercings || profile.piercings,
			tattoos: prevProfile.tattoos || profile.tattoos,
			social: prevProfile.social.concat(profile.social || []),
			releases: prevProfile.releases.concat(profile.releases ? profile.releases : []), // don't flatten fallbacks
			avatars: prevProfile.avatars,
		};

		if (profile.avatar) {
			const avatar = Array.isArray(profile.avatar)
				? profile.avatar.map(avatarX => curateAvatar(avatarX))
				: curateAvatar(profile.avatar);

			accProfile.avatars = prevProfile.avatars.concat([avatar]); // don't flatten fallbacks
		}

		return accProfile;
	}, {
		social: [],
		avatars: [],
		releases: [],
	});

	const [birthPlace, residencePlace] = await Promise.all([
		resolvePlace(mergedProfile.birthPlace),
		resolvePlace(mergedProfile.residencePlace),
	]);

	mergedProfile.birthPlace = birthPlace;
	mergedProfile.residencePlace = residencePlace;

	if (!mergedProfile.birthPlace && mergedProfile.nationality) {
		const country = await knex('countries')
			.where('nationality', 'ilike', `%${mergedProfile.nationality}%`)
			.orderBy('priority', 'desc')
			.first();

		// an unknown nationality leaves the birth place unset instead of throwing
		if (country) {
			mergedProfile.birthPlace = {
				country: country.alpha2,
			};
		}
	}

	return mergedProfile;
}
|
||||||
|
|
||||||
|
/**
 * Scrape the profile of one actor from every configured source.
 *
 * A source is either a single scraper slug or a group of slugs. Within a
 * group the scrapers are tried in order and the first one that yields a
 * profile wins; the remaining scrapers of the group are not consulted.
 *
 * @param {Array<string|string[]>} sources - scraper slugs or fallback groups
 * @param {string} actorName - actor name as requested
 * @param {Object|null|undefined} actorEntry - existing actor; its name is
 *   preferred for the lookup when available
 * @param {Object} sitesBySlug - sites and networks keyed by slug
 * @returns {Promise<Array<Object|null>>} one profile per source, null when the
 *   source yielded nothing
 */
async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) {
	// Fetch the profile from a single scraper; throws when unavailable.
	async function fetchScraperProfile(scraperSlug) {
		const scraper = scrapers.actors[scraperSlug];

		if (!scraper) {
			logger.warn(`No profile scraper available for ${scraperSlug}`);
			throw new Error(`No profile scraper available for ${scraperSlug}`);
		}

		logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);

		const site = sitesBySlug[scraperSlug] || null;
		const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, include);

		if (!profile) {
			logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}`);

			// a missing profile is an expected outcome, not worth a warning
			throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false });
		}

		logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);

		return {
			...profile,
			name: actorName,
			scraper: scraperSlug,
			site,
			// releases may be plain URLs; make sure each one knows its site
			releases: profile.releases?.map(release => (typeof release === 'string'
				? { url: release, site }
				: { ...release, site: release.site || site }
			)),
		};
	}

	return Promise.map(sources, async (source) => {
		const scraperSlugs = [].concat(source);

		// only the failure of the last scraper attempted is reported
		let lastError = new Error();

		for (const scraperSlug of scraperSlugs) {
			try {
				// sequential on purpose: the next scraper is only a fallback
				return await fetchScraperProfile(scraperSlug); // eslint-disable-line no-await-in-loop
			} catch (error) {
				lastError = error;
			}
		}

		if (lastError.warn !== false) {
			logger.warn(`Error in scraper ${source}: ${lastError.message}`);
		}

		return null;
	});
}
|
||||||
|
|
||||||
|
/**
 * Scrape, merge and optionally store the profiles of the given actors.
 *
 * Sources come from `argv.sources`, then `config.profiles`, then all known
 * actor scrapers. When releases are requested the fallback grouping of the
 * sources is flattened, so every scraper is consulted.
 *
 * @param {string[]} [actorNames] - actor names; defaults to `argv.actors`
 * @returns {Promise<Array<Object|null>>} merged profile per actor, null when no
 *   profile was found or scraping failed
 */
async function scrapeActors(actorNames) {
	return Promise.map(actorNames || argv.actors, async (actorName) => {
		try {
			const actorSlug = slugify(actorName);
			const actorEntry = await knex('actors').where({ slug: actorSlug }).first();
			const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);

			const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested
			const sourceSlugs = finalSources.flat();

			const [siteEntries, networkEntries] = await Promise.all([
				knex('sites')
					.leftJoin('networks', 'sites.network_id', 'networks.id')
					.select(
						'sites.*',
						'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
					)
					.whereIn('sites.slug', sourceSlugs),
				knex('networks').select('*').whereIn('slug', sourceSlugs),
			]);

			const sites = await curateSites(siteEntries, true);
			const networks = networkEntries.map(network => ({ ...network, isFallback: true }));

			// networks first, so a site with the same slug takes precedence
			const sitesBySlug = Object.fromEntries([].concat(networks, sites).map(site => [site.slug, site]));

			// pass the flattened sources, otherwise the grouping above is never ignored
			const profiles = await scrapeProfiles(finalSources, actorName, actorEntry, sitesBySlug);
			const profile = await mergeProfiles(profiles, actorEntry);

			if (profile === null) {
				logger.warn(`Could not find profile for actor '${actorName}'`);

				if (argv.save && !actorEntry) {
					// remember the actor, so it can be retried later
					await storeActor({ name: actorName }, false, false);
				}

				return null;
			}

			if (argv.inspect) {
				console.log(profile);
				logger.info(`Found ${profile.releases.length} releases for ${actorName}`);
			}

			if (argv.save) {
				if (actorEntry) {
					await Promise.all([
						updateActor(profile, true, true),
						storeAvatars(profile.avatars, actorEntry.id),
					]);

					return profile;
				}

				await storeActor(profile, true, true);
			}

			return profile;
		} catch (error) {
			// one failing actor must not abort the whole batch
			logger.warn(`${actorName}: ${error}`);
			logger.verbose(error && error.stack);

			return null;
		}
	}, {
		concurrency: 3,
	});
}
|
||||||
|
|
||||||
|
/**
 * Scrape the profiles of all actors that have never been scraped, i.e. whose
 * `scraped_at` is still empty.
 *
 * @returns {Promise<Array<Object|null>>} result of `scrapeActors` for those actors
 */
async function scrapeBasicActors() {
	const unscrapedActors = await knex('actors').where('scraped_at', null);
	const actorNames = unscrapedActors.map(({ name }) => name);

	return scrapeActors(actorNames);
}
|
||||||
|
|
||||||
|
/**
 * Associate actors with releases, creating actor entries that do not exist yet.
 *
 * @param {Object} mappedActors - actors keyed by slug; each actor carries a
 *   `name` and an iterable `releaseIds`
 * @param {Object[]} releases - stored releases the actors appear in; only `id` is read
 * @returns {Promise<void>}
 */
async function associateActors(mappedActors, releases) {
	const [existingActorEntries, existingAssociationEntries] = await Promise.all([
		knex('actors')
			.whereIn('name', Object.values(mappedActors).map(actor => actor.name))
			.orWhereIn('slug', Object.keys(mappedActors)),
		knex('releases_actors').whereIn('release_id', releases.map(release => release.id)),
	]);

	// index once, instead of scanning the entries for every actor and release
	const actorEntriesBySlug = existingActorEntries.reduce(
		(acc, actorEntry) => (acc.has(actorEntry.slug) ? acc : acc.set(actorEntry.slug, actorEntry)),
		new Map(),
	);

	const toAssociationKey = association => `${association.actor_id}:${association.release_id}`;
	const existingAssociationKeys = new Set(existingAssociationEntries.map(toAssociationKey));

	const associations = await Promise.map(Object.entries(mappedActors), async ([actorSlug, actor]) => {
		try {
			const actorEntry = actorEntriesBySlug.get(actorSlug) || await storeActor(actor);

			return Array.from(actor.releaseIds)
				.map(releaseId => ({
					release_id: releaseId,
					actor_id: actorEntry.id,
				}))
				// remove associations already in database
				.filter(association => !existingAssociationKeys.has(toAssociationKey(association)));
		} catch (error) {
			// a single failing actor must not prevent the other associations
			logger.error(actor.name, error);

			return null;
		}
	});

	const newAssociations = associations.filter(Boolean).flat();

	// nothing new to associate; do not issue an insert without rows
	if (newAssociations.length === 0) {
		return;
	}

	await knex('releases_actors').insert(newAssociations);

	// basic actor scraping is failure prone, so it is deliberately not run together with actor association
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
associateActors,
|
||||||
|
fetchActors,
|
||||||
|
scrapeActors,
|
||||||
|
scrapeBasicActors,
|
||||||
|
};
|
541
src/actors.js
|
@ -1,539 +1,26 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
const config = require('config');
|
|
||||||
const Promise = require('bluebird');
|
|
||||||
const UrlPattern = require('url-pattern');
|
|
||||||
const moment = require('moment');
|
|
||||||
|
|
||||||
const logger = require('./logger')(__filename);
|
|
||||||
const knex = require('./knex');
|
|
||||||
const argv = require('./argv');
|
|
||||||
const include = require('./utils/argv-include')(argv);
|
|
||||||
const scrapers = require('./scrapers/scrapers');
|
|
||||||
const whereOr = require('./utils/where-or');
|
|
||||||
const resolvePlace = require('./utils/resolve-place');
|
|
||||||
const slugify = require('./utils/slugify');
|
const slugify = require('./utils/slugify');
|
||||||
const capitalize = require('./utils/capitalize');
|
|
||||||
const { curateSites } = require('./sites');
|
|
||||||
const { storeMedia, associateMedia } = require('./media');
|
|
||||||
|
|
||||||
async function curateActor(actor) {
|
async function storeReleaseActors(releases) {
|
||||||
const [aliases, avatar, photos, social] = await Promise.all([
|
const releaseIdsByActor = releases.reduce(
|
||||||
knex('actors').where({ alias_for: actor.id }),
|
(acc, release) => release.actors.reduce((actorAcc, actor) => {
|
||||||
knex('actors_avatars')
|
const releaseActor = actor.name ? actor : { name: actor };
|
||||||
.where('actor_id', actor.id)
|
const actorSlug = slugify(releaseActor.name);
|
||||||
.join('media', 'media.id', 'actors_avatars.media_id')
|
|
||||||
.first(),
|
|
||||||
knex('actors_photos')
|
|
||||||
.where('actor_id', actor.id)
|
|
||||||
.join('media', 'media.id', 'actors_photos.media_id')
|
|
||||||
.orderBy('index'),
|
|
||||||
knex('actors_social')
|
|
||||||
.where('actor_id', actor.id)
|
|
||||||
.orderBy('platform', 'desc'),
|
|
||||||
]);
|
|
||||||
|
|
||||||
const curatedActor = {
|
|
||||||
id: actor.id,
|
|
||||||
gender: actor.gender,
|
|
||||||
name: actor.name,
|
|
||||||
description: actor.description,
|
|
||||||
birthdate: actor.birthdate && new Date(actor.birthdate),
|
|
||||||
country: actor.country_alpha2,
|
|
||||||
origin: (actor.birth_city || actor.birth_state || actor.birth_country_alpha2) ? {} : null,
|
|
||||||
residence: (actor.residence_city || actor.residence_state || actor.residence_country_alpha2) ? {} : null,
|
|
||||||
ethnicity: actor.ethnicity,
|
|
||||||
height: actor.height,
|
|
||||||
weight: actor.weight,
|
|
||||||
bust: actor.bust,
|
|
||||||
waist: actor.waist,
|
|
||||||
hip: actor.hip,
|
|
||||||
naturalBoobs: actor.natural_boobs,
|
|
||||||
aliases: aliases.map(({ name }) => name),
|
|
||||||
slug: actor.slug,
|
|
||||||
avatar,
|
|
||||||
photos,
|
|
||||||
hasTattoos: actor.has_tattoos,
|
|
||||||
hasPiercings: actor.has_piercings,
|
|
||||||
tattoos: actor.tattoos,
|
|
||||||
piercings: actor.piercings,
|
|
||||||
social,
|
|
||||||
scrapedAt: actor.scraped_at,
|
|
||||||
};
|
|
||||||
|
|
||||||
if (curatedActor.birthdate) {
|
|
||||||
curatedActor.age = moment().diff(curatedActor.birthdate, 'years');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (actor.birth_city) curatedActor.origin.city = actor.birth_city;
|
|
||||||
if (actor.birth_state) curatedActor.origin.state = actor.birth_state;
|
|
||||||
|
|
||||||
if (actor.birth_country_alpha2) {
|
|
||||||
curatedActor.origin.country = {
|
|
||||||
alpha2: actor.birth_country_alpha2,
|
|
||||||
name: actor.birth_country_name,
|
|
||||||
alias: actor.birth_country_alias,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
if (actor.residence_city) curatedActor.residence.city = actor.residence_city;
|
|
||||||
if (actor.residence_state) curatedActor.residence.state = actor.residence_state;
|
|
||||||
|
|
||||||
if (actor.residence_country_alpha2) {
|
|
||||||
curatedActor.residence.country = {
|
|
||||||
alpha2: actor.residence_country_alpha2,
|
|
||||||
name: actor.residence_country_name,
|
|
||||||
alias: actor.residence_country_alias,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
return curatedActor;
|
|
||||||
}
|
|
||||||
|
|
||||||
function curateActors(releases) {
|
|
||||||
return Promise.all(releases.map(async release => curateActor(release)));
|
|
||||||
}
|
|
||||||
|
|
||||||
function curateActorEntry(actor, scraped, scrapeSuccess) {
|
|
||||||
const curatedActor = {
|
|
||||||
name: capitalize(actor.name),
|
|
||||||
slug: slugify(actor.name),
|
|
||||||
birthdate: actor.birthdate,
|
|
||||||
description: actor.description,
|
|
||||||
gender: actor.gender,
|
|
||||||
ethnicity: actor.ethnicity,
|
|
||||||
bust: actor.bust,
|
|
||||||
waist: actor.waist,
|
|
||||||
hip: actor.hip,
|
|
||||||
natural_boobs: actor.naturalBoobs,
|
|
||||||
height: actor.height,
|
|
||||||
weight: actor.weight,
|
|
||||||
hair: actor.hair,
|
|
||||||
eyes: actor.eyes,
|
|
||||||
has_tattoos: actor.hasTattoos,
|
|
||||||
has_piercings: actor.hasPiercings,
|
|
||||||
tattoos: actor.tattoos,
|
|
||||||
piercings: actor.piercings,
|
|
||||||
};
|
|
||||||
|
|
||||||
if (actor.id) {
|
|
||||||
curatedActor.id = actor.id;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (actor.birthPlace) {
|
|
||||||
curatedActor.birth_city = actor.birthPlace.city;
|
|
||||||
curatedActor.birth_state = actor.birthPlace.state;
|
|
||||||
curatedActor.birth_country_alpha2 = actor.birthPlace.country;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (actor.residencePlace) {
|
|
||||||
curatedActor.residence_city = actor.residencePlace.city;
|
|
||||||
curatedActor.residence_state = actor.residencePlace.state;
|
|
||||||
curatedActor.residence_country_alpha2 = actor.residencePlace.country;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (scraped) {
|
|
||||||
curatedActor.scraped_at = new Date();
|
|
||||||
curatedActor.scrape_success = scrapeSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
return curatedActor;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Normalize a social media URL into an entry for the `actors_social` table.
 *
 * The URL is tried against each known platform pattern in order. The first match
 * determines the platform label and the canonical URL; an unrecognized URL is kept
 * as-is and gets no platform.
 *
 * @param {string} url - social link as scraped
 * @param {number} actorId - actor the link belongs to
 * @returns {{ url: string, platform: (string|undefined), actor_id: number }}
 */
function curateSocialEntry(url, actorId) {
    const platforms = [
        // links supplied by PH often look like domain.com/domain.com/username
        {
            label: 'twitter',
            pattern: 'http(s)\\://(*)twitter.com/:username(/)(?*)',
            format: username => `https://www.twitter.com/${username}`,
        },
        {
            label: 'youtube',
            pattern: 'http(s)\\://(*)youtube.com/channel/:username(?*)',
            format: username => `https://www.youtube.com/channel/${username}`,
        },
        {
            label: 'instagram',
            pattern: 'http(s)\\://(*)instagram.com/:username(/)(?*)',
            format: username => `https://www.instagram.com/${username}`,
        },
        {
            label: 'snapchat',
            pattern: 'http(s)\\://(*)snapchat.com/add/:username(/)(?*)',
            format: username => `https://www.snapchat.com/add/${username}`,
        },
        {
            label: 'tumblr',
            pattern: 'http(s)\\://:username.tumblr.com(*)',
            format: username => `https://${username}.tumblr.com`,
        },
        {
            label: 'onlyfans',
            pattern: 'http(s)\\://(*)onlyfans.com/:username(/)(?*)',
            format: username => `https://www.onlyfans.com/${username}`,
        },
        {
            label: 'fancentro',
            pattern: 'http(s)\\://(*)fancentro.com/:username(/)(?*)',
            format: username => `https://www.fancentro.com/${username}`,
        },
        {
            label: 'modelhub',
            pattern: 'http(s)\\://(*)modelhub.com/:username(/)(?*)',
            format: username => `https://www.modelhub.com/${username}`,
        },
    ];

    let recognized = null;

    // stop at the first platform whose pattern matches
    for (const { label, pattern, format } of platforms) {
        const hit = new UrlPattern(pattern).match(url);

        if (hit) {
            recognized = {
                platform: label,
                original: url,
                username: hit.username,
                url: format ? format(hit.username) : url,
            };

            break;
        }
    }

    const social = recognized || { url };

    return {
        url: social.url,
        platform: social.platform,
        actor_id: actorId,
    };
}
|
}, acc),
|
||||||
|
{},
|
||||||
|
);
|
||||||
|
|
||||||
/**
 * Curate a list of social URLs for an actor, leaving out links that are already stored
 * for that actor or that occur more than once in the list (compared case-insensitively).
 *
 * @param {Array<string>} [urls] - scraped social links; a falsy value yields an empty list
 * @param {number} actorId - actor the links belong to
 * @returns {Promise<Array<Object>>} new entries ready for insertion into `actors_social`
 */
async function curateSocialEntries(urls, actorId) {
    if (!urls) {
        return [];
    }

    const existingSocialLinks = await knex('actors_social').where('actor_id', actorId);

    // lowercase every known URL once, so each candidate needs a single lookup
    // instead of a rescan of both lists; rows without a URL cannot collide and are ignored
    const seenUrls = new Set(existingSocialLinks
        .filter(entry => entry.url)
        .map(entry => entry.url.toLowerCase()));

    return urls.reduce((acc, url) => {
        const socialEntry = curateSocialEntry(url, actorId);
        const normalizedUrl = socialEntry.url.toLowerCase();

        if (seenUrls.has(normalizedUrl)) {
            // prevent duplicates
            return acc;
        }

        seenUrls.add(normalizedUrl);
        acc.push(socialEntry);

        return acc;
    }, []);
}
|
|
||||||
|
|
||||||
/**
 * Fetch actors matching a query object, joined with their birth and residence countries.
 *
 * @param {Object} queryObject - conditions handed to whereOr for the `actors` table
 * @param {number} [limit=100] - maximum number of rows to return
 * @returns {Promise<Array>} curated actors
 */
async function fetchActors(queryObject, limit = 100) {
    const countryColumns = [
        'birth_countries.alpha2 as birth_country_alpha2',
        'birth_countries.name as birth_country_name',
        'birth_countries.alias as birth_country_alias',
        'residence_countries.alpha2 as residence_country_alpha2',
        'residence_countries.name as residence_country_name',
        'residence_countries.alias as residence_country_alias',
    ];

    const actorRows = await knex('actors')
        .select('actors.*', ...countryColumns)
        .leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
        .leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2')
        .orderBy(['actors.name', 'actors.gender'])
        .where(builder => whereOr(queryObject, 'actors', builder))
        .limit(limit);

    return curateActors(actorRows);
}
|
|
||||||
|
|
||||||
/**
 * Curate an actor's social links and insert the new ones into `actors_social`.
 *
 * @param {Array<string>} [urls] - scraped social links
 * @param {number} actorId - actor the links belong to
 * @returns {Promise<void>}
 */
async function storeSocialLinks(urls, actorId) {
    // curation already drops links that are stored or repeated
    const newSocialEntries = await curateSocialEntries(urls, actorId);

    await knex('actors_social').insert(newSocialEntries);
}
|
|
||||||
|
|
||||||
/**
 * Store avatar media and associate it with an actor.
 *
 * @param {Array} [avatars] - avatar sources; a missing or empty list is a no-op
 * @param {number} actorId - actor to associate the avatars with
 * @returns {Promise<Array|Object>} an empty array when there is nothing to store,
 *   otherwise whatever storeMedia resolved with
 */
async function storeAvatars(avatars, actorId) {
    const nothingToStore = !avatars || avatars.length === 0;

    if (nothingToStore) {
        return [];
    }

    const storedAvatars = await storeMedia(avatars, 'actor', 'avatar');
    const avatarsByActor = { [actorId]: avatars };

    await associateMedia(avatarsByActor, storedAvatars, 'actor', 'photo', 'avatar');

    return storedAvatars;
}
|
|
||||||
|
|
||||||
/**
 * Insert a new actor, then store its social links and, when present, its avatars.
 *
 * @param {Object} actor - actor profile; `social` and `avatars` are read from it
 * @param {boolean} [scraped=false] - whether the profile came from a scrape attempt
 * @param {boolean} [scrapeSuccess=false] - whether that scrape attempt succeeded
 * @returns {Promise<Object>} the inserted `actors` row
 */
async function storeActor(actor, scraped = false, scrapeSuccess = false) {
    const insertedRows = await knex('actors')
        .insert(curateActorEntry(actor, scraped, scrapeSuccess))
        .returning('*');

    const actorEntry = insertedRows[0];

    // the related rows need the ID assigned by the insert
    await storeSocialLinks(actor.social, actorEntry.id);

    if (actor.avatars) {
        await storeAvatars(actor.avatars, actorEntry.id);
    }

    logger.info(`Added new entry for actor '${actor.name}'`);

    return actorEntry;
}
|
|
||||||
|
|
||||||
/**
 * Update an existing actor row from a profile and store any new social links.
 *
 * @param {Object} actor - actor profile; `id` selects the row to update
 * @param {boolean} [scraped=false] - whether the profile came from a scrape attempt
 * @param {boolean} [scrapeSuccess=false] - whether that scrape attempt succeeded
 * @returns {Promise<Object>} the updated `actors` row
 */
async function updateActor(actor, scraped = false, scrapeSuccess = false) {
    const updatedRows = await knex('actors')
        .where({ id: actor.id })
        .update(curateActorEntry(actor, scraped, scrapeSuccess))
        .returning('*');

    const actorEntry = updatedRows[0];

    await storeSocialLinks(actor.social, actor.id);

    logger.info(`Updated entry for actor '${actor.name}'`);

    return actorEntry;
}
|
|
||||||
|
|
||||||
/**
 * Merge the profiles returned by several scrapers into one actor profile.
 *
 * Profiles are folded in order and the first profile to supply a value wins; later
 * profiles only fill gaps. Social links, releases and avatars are accumulated across
 * all profiles. Birth and residence places are passed through resolvePlace, and the
 * birth country falls back to a lookup by nationality when no birth place is known.
 *
 * @param {Array<Object|null>} profiles - scraped profiles; falsy entries are skipped
 * @param {Object} [actor] - existing actor entry; when given, its id and name take precedence
 * @returns {Promise<Object|null>} the merged profile, or null when no profile was found
 */
async function mergeProfiles(profiles, actor) {
    if (profiles.filter(Boolean).length === 0) {
        return null;
    }

    const mergedProfile = profiles.reduce((prevProfile, profile) => {
        // skip every missing profile, consistent with the guard above
        if (!profile) {
            return prevProfile;
        }

        const accProfile = {
            id: actor ? actor.id : null,
            name: actor ? actor.name : (prevProfile.name || profile.name),
            description: prevProfile.description || profile.description,
            gender: prevProfile.gender || profile.gender,
            // keep looking when the previous birthdate is missing or not a valid date
            birthdate: !prevProfile.birthdate || Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate,
            birthPlace: prevProfile.birthPlace || profile.birthPlace,
            residencePlace: prevProfile.residencePlace || profile.residencePlace,
            nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available
            ethnicity: prevProfile.ethnicity || profile.ethnicity,
            // only accept a bust value that has digits followed by further word characters
            bust: prevProfile.bust || (/\d+\w+/.test(profile.bust) ? profile.bust : null),
            waist: prevProfile.waist || profile.waist,
            hip: prevProfile.hip || profile.hip,
            // booleans: an explicit false from an earlier profile must not be overridden
            naturalBoobs: prevProfile.naturalBoobs === undefined ? profile.naturalBoobs : prevProfile.naturalBoobs,
            height: prevProfile.height || profile.height,
            weight: prevProfile.weight || profile.weight,
            hair: prevProfile.hair || profile.hair,
            eyes: prevProfile.eyes || profile.eyes,
            hasPiercings: prevProfile.hasPiercings === undefined ? profile.hasPiercings : prevProfile.hasPiercings,
            hasTattoos: prevProfile.hasTattoos === undefined ? profile.hasTattoos : prevProfile.hasTattoos,
            piercings: prevProfile.piercings || profile.piercings,
            tattoos: prevProfile.tattoos || profile.tattoos,
            social: prevProfile.social.concat(profile.social || []),
            releases: prevProfile.releases.concat(profile.releases ? profile.releases : []), // don't flatten fallbacks
        };

        if (profile.avatar) {
            // an avatar is either a plain source or an object with `src` and optional `copyright`
            const curateAvatar = rawAvatar => ({
                src: rawAvatar.src || rawAvatar,
                scraper: profile.scraper,
                // use the avatar's own copyright; fall back to the site or scraper name
                copyright: rawAvatar.copyright === undefined ? capitalize(profile.site?.name || profile.scraper) : rawAvatar.copyright,
            });

            const avatar = Array.isArray(profile.avatar)
                ? profile.avatar.map(avatarX => curateAvatar(avatarX))
                : curateAvatar(profile.avatar);

            accProfile.avatars = prevProfile.avatars.concat([avatar]); // don't flatten fallbacks
        } else {
            accProfile.avatars = prevProfile.avatars;
        }

        return accProfile;
    }, {
        social: [],
        avatars: [],
        releases: [],
    });

    const [birthPlace, residencePlace] = await Promise.all([
        resolvePlace(mergedProfile.birthPlace),
        resolvePlace(mergedProfile.residencePlace),
    ]);

    mergedProfile.birthPlace = birthPlace;
    mergedProfile.residencePlace = residencePlace;

    if (!mergedProfile.birthPlace && mergedProfile.nationality) {
        const country = await knex('countries')
            .where('nationality', 'ilike', `%${mergedProfile.nationality}%`)
            .orderBy('priority', 'desc')
            .first();

        // an unrecognized nationality yields no row; leave the birth place unset in that case
        if (country) {
            mergedProfile.birthPlace = {
                country: country.alpha2,
            };
        }
    }

    return mergedProfile;
}
|
|
||||||
|
|
||||||
/**
 * Fetch an actor's profile from each source.
 *
 * A source is either a single scraper slug or an array of slugs. Within an array the
 * scrapers are tried one after another, and the first one to return a profile wins.
 *
 * NOTE(review): relies on `Promise.map`, which is not part of native promises;
 * presumably `Promise` is Bluebird in this file, confirm against the imports.
 * NOTE(review): `include` is not defined in this function; presumably a module-level
 * value, confirm.
 *
 * @param {Array<string|Array<string>>} sources - scraper slugs, optionally grouped
 * @param {string} actorName - name as requested
 * @param {Object} [actorEntry] - existing actor entry; its name is used for the lookup when present
 * @param {Object} sitesBySlug - sites and networks keyed by slug
 * @returns {Promise<Array<Object|null>>} one profile per source, null where none was found
 */
async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) {
    return Promise.map(sources, async (source) => {
        const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] }));

        try {
            // start from a rejection so that each scraper only runs if all previous ones failed
            return await profileScrapers.reduce(async (outcome, { scraper, scraperSlug }) => outcome.catch(async () => {
                if (!scraper) {
                    logger.warn(`No profile scraper available for ${scraperSlug}`);
                    throw new Error(`No profile scraper available for ${scraperSlug}`);
                }

                logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);

                const site = sitesBySlug[scraperSlug] || null;
                const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, include);

                if (profile) {
                    logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);

                    return {
                        ...profile,
                        name: actorName,
                        scraper: scraperSlug,
                        site,
                        // normalize releases to objects that always carry a site
                        releases: profile.releases?.map(release => (typeof release === 'string'
                            ? { url: release, site }
                            : { ...release, site: release.site || site }
                        )),
                    };
                }

                logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}`);

                // a missing profile is expected, so it is flagged to suppress the warning below
                throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false });
            }), Promise.reject(new Error()));
        } catch (error) {
            if (error.warn !== false) {
                logger.warn(`Error in scraper ${source}: ${error.message}`);
            }
        }

        return null;
    });
}
|
|
||||||
|
|
||||||
/**
 * Scrape, merge and optionally store profiles for a list of actors.
 *
 * At most three actors are processed at once. A failure for one actor is logged and
 * yields null for that actor without aborting the others.
 *
 * @param {Array<string>} [actorNames] - names to scrape; defaults to argv.actors
 * @returns {Promise<Array<Object|null>>} the merged profile per actor, or null when
 *   no profile was found or the scrape failed
 */
async function scrapeActors(actorNames) {
    return Promise.map(actorNames || argv.actors, async (actorName) => {
        try {
            const actorSlug = slugify(actorName);
            const actorEntry = await knex('actors').where({ slug: actorSlug }).first();
            const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);

            // ignore race-to-success grouping when scenes are requested
            const finalSources = argv.withReleases ? sources.flat() : sources;
            const sourceSlugs = finalSources.flat();

            const [siteEntries, networkEntries] = await Promise.all([
                knex('sites')
                    .leftJoin('networks', 'sites.network_id', 'networks.id')
                    .select(
                        'sites.*',
                        'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
                    )
                    .whereIn('sites.slug', sourceSlugs),
                knex('networks').select('*').whereIn('slug', sourceSlugs),
            ]);

            const sites = await curateSites(siteEntries, true);
            const networks = networkEntries.map(network => ({ ...network, isFallback: true }));
            // sites come last, so a site takes precedence over a network with the same slug
            const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});

            // pass the final sources, so the flattening above actually takes effect
            const profiles = await scrapeProfiles(finalSources, actorName, actorEntry, sitesBySlug);
            const profile = await mergeProfiles(profiles, actorEntry);

            if (profile === null) {
                logger.warn(`Could not find profile for actor '${actorName}'`);

                if (argv.save && !actorEntry) {
                    // store a bare entry so the actor is known even without a profile
                    await storeActor({ name: actorName }, false, false);
                }

                return null;
            }

            if (argv.inspect) {
                console.log(profile);
                logger.info(`Found ${profile.releases.length} releases for ${actorName}`);
            }

            if (argv.save) {
                if (actorEntry) {
                    await Promise.all([
                        updateActor(profile, true, true),
                        storeAvatars(profile.avatars, actorEntry.id),
                    ]);

                    return profile;
                }

                await storeActor(profile, true, true);
            }

            return profile;
        } catch (error) {
            logger.warn(`${actorName}: ${error}`);

            return null;
        }
    }, {
        concurrency: 3,
    });
}
|
|
||||||
|
|
||||||
/**
 * Scrape profiles for every stored actor whose `scraped_at` is still null.
 *
 * @returns {Promise<Array<Object|null>>} the result of scrapeActors for those actors
 */
async function scrapeBasicActors() {
    const unscrapedActors = await knex('actors').where('scraped_at', null);
    const unscrapedNames = unscrapedActors.map(({ name }) => name);

    return scrapeActors(unscrapedNames);
}
|
|
||||||
|
|
||||||
/**
 * Link actors to releases in `releases_actors`, creating actor entries that don't exist yet.
 *
 * @param {Object} mappedActors - actors keyed by slug; each has a `name` and an iterable `releaseIds`
 * @param {Array<Object>} releases - stored releases, used to look up existing associations
 * @returns {Promise<void>}
 */
async function associateActors(mappedActors, releases) {
    const actorNames = Object.values(mappedActors).map(actor => actor.name);
    const releaseIds = releases.map(release => release.id);

    const [existingActorEntries, existingAssociationEntries] = await Promise.all([
        knex('actors')
            .whereIn('name', actorNames)
            .orWhereIn('slug', Object.keys(mappedActors)),
        knex('releases_actors').whereIn('release_id', releaseIds),
    ]);

    const isAlreadyAssociated = association => existingAssociationEntries.some(associationEntry => (
        associationEntry.actor_id === association.actor_id
        && associationEntry.release_id === association.release_id
    ));

    const associations = await Promise.map(Object.entries(mappedActors), async ([actorSlug, actor]) => {
        try {
            const knownActorEntry = existingActorEntries.find(actorX => actorX.slug === actorSlug);
            const actorEntry = knownActorEntry || await storeActor(actor);

            const actorAssociations = Array.from(actor.releaseIds, releaseId => ({
                release_id: releaseId,
                actor_id: actorEntry.id,
            }));

            // remove associations already in database
            return actorAssociations.filter(association => !isAlreadyAssociated(association));
        } catch (error) {
            logger.error(actor.name, error);

            return null;
        }
    });

    // actors that failed to store yield null and are left out
    const newAssociations = associations.filter(association => association).flat();

    await knex('releases_actors').insert(newAssociations);

    // basic actor scraping is failure prone, don't run together with actor association
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
associateActors,
|
storeReleaseActors,
|
||||||
fetchActors,
|
|
||||||
scrapeActors,
|
|
||||||
scrapeBasicActors,
|
|
||||||
};
|
};
|
||||||
|
|
15
src/app.js
|
@ -5,7 +5,10 @@ const argv = require('./argv');
|
||||||
const initServer = require('./web/server');
|
const initServer = require('./web/server');
|
||||||
|
|
||||||
const knex = require('./knex');
|
const knex = require('./knex');
|
||||||
const fetchUpdates = require('./fetch-updates');
|
const fetchUpdates = require('./updates');
|
||||||
|
const fetchDeep = require('./deep');
|
||||||
|
const { storeReleases } = require('./store-releases');
|
||||||
|
// const { storeReleaseActors } = require('./actors');
|
||||||
|
|
||||||
async function init() {
|
async function init() {
|
||||||
if (argv.server) {
|
if (argv.server) {
|
||||||
|
@ -13,7 +16,15 @@ async function init() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
await fetchUpdates();
|
const updateBaseReleases = (argv.scrape || argv.sites || argv.networks) && await fetchUpdates();
|
||||||
|
|
||||||
|
const updateDeepReleases = updateBaseReleases && await fetchDeep(updateBaseReleases);
|
||||||
|
const argvDeepReleases = argv.scenes && await fetchDeep(argv.scenes);
|
||||||
|
|
||||||
|
await storeReleases([...(updateDeepReleases || []), ...(argvDeepReleases || [])]);
|
||||||
|
|
||||||
|
// await storeReleaseActors(updateReleases);
|
||||||
|
|
||||||
knex.destroy();
|
knex.destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,145 @@
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
const argv = require('./argv');
|
||||||
|
const logger = require('./logger')(__filename);
|
||||||
|
const knex = require('./knex');
|
||||||
|
const scrapers = require('./scrapers/scrapers');
|
||||||
|
const { curateSites } = require('./sites');
|
||||||
|
const { curateNetworks } = require('./networks');
|
||||||
|
|
||||||
|
/**
 * Derive a site slug from a URL: the hostname label directly preceding the last dot.
 *
 * @param {string} url - absolute URL
 * @returns {string|undefined|null} the slug, undefined when the hostname contains no
 *   such label, or null when the URL cannot be parsed
 */
function urlToSiteSlug(url) {
    try {
        const { hostname } = new URL(url);
        const domainMatch = hostname.match(/([\w-]+)\.\w+$/);

        return domainMatch?.[1];
    } catch (error) {
        logger.warn(`Failed to derive site slug from '${url}': ${error.message}`);

        return null;
    }
}
|
||||||
|
|
||||||
|
/**
 * Look up the sites and networks for base releases that have a URL but no site yet.
 *
 * Slugs are derived from the release URLs and matched against both the `sites` and
 * `networks` tables; matching networks are marked as fallbacks.
 *
 * @param {Array<Object>} baseReleases - base releases, see toBaseReleases
 * @returns {Promise<Object>} sites and fallback networks keyed by slug
 */
async function findSites(baseReleases) {
    const baseReleasesWithoutSite = baseReleases.filter(release => release.url && !release.site);

    const siteSlugs = Array.from(new Set(
        baseReleasesWithoutSite
            .map(baseRelease => urlToSiteSlug(baseRelease.url))
            .filter(Boolean),
    ));

    // the two lookups are independent of each other, so run them in parallel
    const [siteEntries, networkEntries] = await Promise.all([
        knex('sites').whereIn('slug', siteSlugs),
        knex('networks').whereIn('slug', siteSlugs),
    ]);

    const [sites, networks] = await Promise.all([
        curateSites(siteEntries, true, false),
        curateNetworks(networkEntries, true, false, false),
    ]);

    const markedNetworks = networks.map(network => ({ ...network, isFallback: true }));

    // networks come last, so a network takes precedence over a site with the same slug
    const sitesBySlug = []
        .concat(sites, markedNetworks)
        .reduce((accSites, site) => ({ ...accSites, [site.slug]: site }), {});

    return sitesBySlug;
}
|
||||||
|
|
||||||
|
/**
 * Normalize a mixed list of base releases and bare URLs into base release objects.
 *
 * Every resulting release is marked `deep: false`. Entries that are neither a URL
 * string nor a plain object are discarded with a warning.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - base releases and/or URL strings
 * @returns {Array<Object>} base releases
 */
function toBaseReleases(baseReleasesOrUrls) {
    return baseReleasesOrUrls.reduce((baseReleases, candidate) => {
        if (candidate.url) {
            // base release with URL
            return [...baseReleases, { ...candidate, deep: false }];
        }

        if (/^http/.test(candidate)) {
            // URL
            return [...baseReleases, { url: candidate, deep: false }];
        }

        if (typeof candidate === 'object' && !Array.isArray(candidate)) {
            // base release without URL, prepare for passthrough
            return [...baseReleases, { ...candidate, deep: false }];
        }

        logger.warn(`Malformed base release, discarding '${candidate}'`);

        return baseReleases;
    }, []);
}
|
||||||
|
|
||||||
|
/**
 * Deep scrape a single base release, merging the scraped data over the base release.
 *
 * The base release is returned as-is when no site or scraper is available, when the
 * scraper cannot fetch the requested type, or when the scrape fails. When the release
 * has neither URL nor path, or deep scraping is disabled (argv.deep), it is only
 * complemented with its site.
 *
 * @param {Object} baseRelease - base release, see toBaseReleases
 * @param {Object} sites - sites keyed by slug, used when the release carries no site
 * @param {string} [type='scene'] - 'scene' uses fetchScene, anything else fetchMovie
 * @returns {Promise<Object>} the merged release, or the base release on failure
 */
async function scrapeRelease(baseRelease, sites, type = 'scene') {
    const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)];

    if (!site) {
        logger.warn(`No site available for ${baseRelease.url}`);
        return baseRelease;
    }

    if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
        return {
            ...baseRelease,
            site,
        };
    }

    const scraper = scrapers.releases[site.slug];

    if (!scraper) {
        logger.warn(`Could not find scraper for ${baseRelease.url}`);
        return baseRelease;
    }

    if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
        logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
        return baseRelease;
    }

    try {
        const scrapedRelease = type === 'scene'
            ? await scraper.fetchScene(baseRelease.url, site, baseRelease)
            : await scraper.fetchMovie(baseRelease.url, site, baseRelease);

        const mergedRelease = {
            ...baseRelease,
            ...scrapedRelease,
            deep: !!scrapedRelease,
            site,
        };

        if (scrapedRelease && baseRelease.tags) {
            // keep the base tags; a scraper that returns no tags must not add `undefined` to the list
            mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags || []);
        }

        return mergedRelease;
    } catch (error) {
        logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);
        return baseRelease;
    }
}
|
||||||
|
|
||||||
|
/**
 * Deep scrape a list of base releases concurrently.
 *
 * @param {Array<Object>} baseReleases - base releases to scrape
 * @param {Object} sites - sites keyed by slug
 * @returns {Promise<Array<Object>>} scraped releases, in input order
 */
async function scrapeReleases(baseReleases, sites) {
    const pendingReleases = baseReleases.map(baseRelease => scrapeRelease(baseRelease, sites));

    return Promise.all(pendingReleases);
}
|
||||||
|
|
||||||
|
/**
 * Entry point for deep scraping: normalize the input, resolve the sites involved and
 * scrape every release.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - base releases and/or release URLs
 * @returns {Promise<Array<Object>>} deep releases
 */
async function fetchReleases(baseReleasesOrUrls) {
    const baseReleases = toBaseReleases(baseReleasesOrUrls);
    const sitesBySlug = await findSites(baseReleases);

    return scrapeReleases(baseReleases, sitesBySlug);
}
|
||||||
|
|
||||||
|
module.exports = fetchReleases;
|
|
@ -4,29 +4,33 @@ const knex = require('./knex');
|
||||||
const whereOr = require('./utils/where-or');
|
const whereOr = require('./utils/where-or');
|
||||||
const { fetchSites } = require('./sites');
|
const { fetchSites } = require('./sites');
|
||||||
|
|
||||||
async function curateNetwork(network, includeParameters = false) {
|
async function curateNetwork(network, includeParameters = false, includeSites = true, includeStudios = false) {
|
||||||
const [sites, studios] = await Promise.all([
|
const curatedNetwork = {
|
||||||
fetchSites({ network_id: network.id }),
|
|
||||||
knex('studios')
|
|
||||||
.where({ network_id: network.id }),
|
|
||||||
]);
|
|
||||||
|
|
||||||
return {
|
|
||||||
id: network.id,
|
id: network.id,
|
||||||
name: network.name,
|
name: network.name,
|
||||||
url: network.url,
|
url: network.url,
|
||||||
description: network.description,
|
description: network.description,
|
||||||
slug: network.slug,
|
slug: network.slug,
|
||||||
sites,
|
|
||||||
parameters: includeParameters ? network.parameters : null,
|
parameters: includeParameters ? network.parameters : null,
|
||||||
studios: studios.map(studio => ({
|
};
|
||||||
|
|
||||||
|
if (includeSites) {
|
||||||
|
curatedNetwork.sites = await fetchSites({ network_id: network.id });
|
||||||
|
}
|
||||||
|
|
||||||
|
if (includeStudios) {
|
||||||
|
const studios = await knex('studios').where({ network_id: network.id });
|
||||||
|
|
||||||
|
curatedNetwork.studios = studios.map(studio => ({
|
||||||
id: studio.id,
|
id: studio.id,
|
||||||
name: studio.name,
|
name: studio.name,
|
||||||
url: studio.url,
|
url: studio.url,
|
||||||
description: studio.description,
|
description: studio.description,
|
||||||
slug: studio.slug,
|
slug: studio.slug,
|
||||||
})),
|
}));
|
||||||
};
|
}
|
||||||
|
|
||||||
|
return curatedNetwork;
|
||||||
}
|
}
|
||||||
|
|
||||||
function curateNetworks(releases) {
|
function curateNetworks(releases) {
|
||||||
|
@ -69,6 +73,8 @@ async function fetchNetworksFromReleases() {
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
curateNetwork,
|
||||||
|
curateNetworks,
|
||||||
fetchNetworks,
|
fetchNetworks,
|
||||||
fetchNetworksFromReleases,
|
fetchNetworksFromReleases,
|
||||||
findNetworkByUrl,
|
findNetworkByUrl,
|
||||||
|
|
|
@ -15,7 +15,7 @@ const {
|
||||||
storeMedia,
|
storeMedia,
|
||||||
associateMedia,
|
associateMedia,
|
||||||
} = require('./media');
|
} = require('./media');
|
||||||
const { fetchSites, findSiteByUrl } = require('./sites');
|
const { fetchSites } = require('./sites');
|
||||||
const slugify = require('./utils/slugify');
|
const slugify = require('./utils/slugify');
|
||||||
const capitalize = require('./utils/capitalize');
|
const capitalize = require('./utils/capitalize');
|
||||||
|
|
||||||
|
@ -174,16 +174,7 @@ async function attachChannelSite(release) {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
throw new Error(`Unable to match channel '${release.channel.slug || release.channel}' from generic URL: ${release.url}`);
|
||||||
const urlSite = await findSiteByUrl(release.channel.url || release.channel);
|
|
||||||
|
|
||||||
return {
|
|
||||||
...release,
|
|
||||||
site: urlSite,
|
|
||||||
};
|
|
||||||
} catch (error) {
|
|
||||||
throw new Error(`Unable to derive channel site from generic URL: ${release.url}`);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function attachStudio(release) {
|
async function attachStudio(release) {
|
||||||
|
|
|
@ -90,7 +90,7 @@ async function scrapeProfile({ qu }, site, withScenes) {
|
||||||
|
|
||||||
const bio = qu.all('.stats li', true).reduce((acc, row) => {
|
const bio = qu.all('.stats li', true).reduce((acc, row) => {
|
||||||
const [key, value] = row.split(':');
|
const [key, value] = row.split(':');
|
||||||
return { ...acc, [slugify(key, { delimiter: '_' })]: value.trim() };
|
return { ...acc, [slugify(key, '_')]: value.trim() };
|
||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
if (bio.height) profile.height = feetInchesToCm(bio.height);
|
if (bio.height) profile.height = feetInchesToCm(bio.height);
|
||||||
|
@ -133,7 +133,7 @@ async function fetchScene(url, site) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchProfile(actorName, scraperSlug, site, include) {
|
async function fetchProfile(actorName, scraperSlug, site, include) {
|
||||||
const actorSlugA = slugify(actorName, { delimiter: '' });
|
const actorSlugA = slugify(actorName, '');
|
||||||
const actorSlugB = slugify(actorName);
|
const actorSlugB = slugify(actorName);
|
||||||
|
|
||||||
const resA = await get(`${site.url}/models/${actorSlugA}.html`);
|
const resA = await get(`${site.url}/models/${actorSlugA}.html`);
|
||||||
|
|
|
@ -43,7 +43,7 @@ function scrapeAll(html, site, upcoming) {
|
||||||
const poster = `https:${$(element).find('.card-main-img').attr('data-src')}`;
|
const poster = `https:${$(element).find('.card-main-img').attr('data-src')}`;
|
||||||
const photos = $(element).find('.card-overlay .image-under').map((photoIndex, photoElement) => `https:${$(photoElement).attr('data-src')}`).toArray();
|
const photos = $(element).find('.card-overlay .image-under').map((photoIndex, photoElement) => `https:${$(photoElement).attr('data-src')}`).toArray();
|
||||||
|
|
||||||
const channel = slugify($(element).find('.collection').attr('title'), { delimiter: '' });
|
const channel = slugify($(element).find('.collection').attr('title'), '');
|
||||||
|
|
||||||
return acc.concat({
|
return acc.concat({
|
||||||
url,
|
url,
|
||||||
|
|
|
@ -61,7 +61,7 @@ function scrapeProfile({ q, qa, qtx }) {
|
||||||
|
|
||||||
const keys = qa('.model-descr_line:not(.model-descr_rait) p.text span', true);
|
const keys = qa('.model-descr_line:not(.model-descr_rait) p.text span', true);
|
||||||
const values = qa('.model-descr_line:not(.model-descr_rait) p.text').map(el => qtx(el));
|
const values = qa('.model-descr_line:not(.model-descr_rait) p.text').map(el => qtx(el));
|
||||||
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, { delimiter: '_' })]: values[index] }), {});
|
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {});
|
||||||
|
|
||||||
if (bio.height) profile.height = Number(bio.height.match(/\((\d+)cm\)/)[1]);
|
if (bio.height) profile.height = Number(bio.height.match(/\((\d+)cm\)/)[1]);
|
||||||
if (bio.weight) profile.weight = Number(bio.weight.match(/\((\d+)kg\)/)[1]);
|
if (bio.weight) profile.weight = Number(bio.weight.match(/\((\d+)kg\)/)[1]);
|
||||||
|
@ -122,7 +122,7 @@ async function fetchScene(url, site, release) {
|
||||||
|
|
||||||
async function fetchProfile(actorName, scraperSlug) {
|
async function fetchProfile(actorName, scraperSlug) {
|
||||||
const actorSlug = slugify(actorName);
|
const actorSlug = slugify(actorName);
|
||||||
const actorSlug2 = slugify(actorName, { delimiter: '' });
|
const actorSlug2 = slugify(actorName, '');
|
||||||
|
|
||||||
const [url, url2] = ['cherrypimps', 'wildoncam'].includes(scraperSlug)
|
const [url, url2] = ['cherrypimps', 'wildoncam'].includes(scraperSlug)
|
||||||
? [`https://${scraperSlug}.com/models/${actorSlug}.html`, `https://${scraperSlug}.com/models/${actorSlug2}.html`]
|
? [`https://${scraperSlug}.com/models/${actorSlug}.html`, `https://${scraperSlug}.com/models/${actorSlug2}.html`]
|
||||||
|
|
|
@ -74,7 +74,7 @@ async function fetchActorReleases(urls) {
|
||||||
async function scrapeProfile(html, _url, actorName) {
|
async function scrapeProfile(html, _url, actorName) {
|
||||||
const { qu } = ex(html);
|
const { qu } = ex(html);
|
||||||
|
|
||||||
const keys = qu.all('.about-title', true).map(key => slugify(key, { delimiter: '_' }));
|
const keys = qu.all('.about-title', true).map(key => slugify(key, '_'));
|
||||||
const values = qu.all('.about-info').map((el) => {
|
const values = qu.all('.about-info').map((el) => {
|
||||||
if (el.children.length > 0) {
|
if (el.children.length > 0) {
|
||||||
return Array.from(el.children, child => child.textContent.trim()).join(', ');
|
return Array.from(el.children, child => child.textContent.trim()).join(', ');
|
||||||
|
|
|
@ -79,7 +79,7 @@ async function fetchScene(url, site) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchProfile(actorName, scraperSlug) {
|
async function fetchProfile(actorName, scraperSlug) {
|
||||||
const actorSlug = slugify(actorName, { delimiter: '' });
|
const actorSlug = slugify(actorName, '');
|
||||||
const url = scraperSlug === 'povperverts'
|
const url = scraperSlug === 'povperverts'
|
||||||
? `https://povperverts.net/models/${actorSlug}.html`
|
? `https://povperverts.net/models/${actorSlug}.html`
|
||||||
: `https://${scraperSlug}.com/models/${actorSlug}.html`;
|
: `https://${scraperSlug}.com/models/${actorSlug}.html`;
|
||||||
|
|
|
@ -233,7 +233,7 @@ async function scrapeScene(html, url, site, baseRelease, mobileHtml) {
|
||||||
release.tags = hasTrans ? [...rawTags, 'transsexual'] : rawTags;
|
release.tags = hasTrans ? [...rawTags, 'transsexual'] : rawTags;
|
||||||
|
|
||||||
const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim() || $('.siteNameSpan').text()?.trim().toLowerCase().replace('.com', '');
|
const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim() || $('.siteNameSpan').text()?.trim().toLowerCase().replace('.com', '');
|
||||||
if (channel) release.channel = slugify(channel, { delimiter: '' });
|
if (channel) release.channel = slugify(channel, '');
|
||||||
|
|
||||||
if (videoData.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/
|
if (videoData.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/
|
||||||
|
|
||||||
|
|
|
@ -193,7 +193,7 @@ function scrapeSceneT1({ html, qu }, site, url, baseRelease, channelRegExp) {
|
||||||
if (channel) {
|
if (channel) {
|
||||||
release.channel = {
|
release.channel = {
|
||||||
force: true,
|
force: true,
|
||||||
slug: slugify(channel, { delimiter: '' }),
|
slug: slugify(channel, ''),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -239,7 +239,7 @@ function scrapeProfile({ el, qu }, site) {
|
||||||
|
|
||||||
return {
|
return {
|
||||||
...acc,
|
...acc,
|
||||||
[slugify(key, { delimiter: '_' })]: value.trim(),
|
[slugify(key, '_')]: value.trim(),
|
||||||
};
|
};
|
||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
|
@ -272,7 +272,7 @@ function scrapeProfileT1({ el, qu }, site) {
|
||||||
|
|
||||||
return {
|
return {
|
||||||
...acc,
|
...acc,
|
||||||
[slugify(key, { delimiter: '_' })]: value.trim(),
|
[slugify(key, '_')]: value.trim(),
|
||||||
};
|
};
|
||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
|
@ -308,7 +308,7 @@ function scrapeProfileTour({ el, qu }, site) {
|
||||||
|
|
||||||
return {
|
return {
|
||||||
...acc,
|
...acc,
|
||||||
[slugify(key, { delimiter: '_' })]: value.trim(),
|
[slugify(key, '_')]: value.trim(),
|
||||||
};
|
};
|
||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
|
@ -382,7 +382,7 @@ async function fetchScene(url, site, baseRelease, beforeFetchLatest) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchProfile(actorName, scraperSlug, site) {
|
async function fetchProfile(actorName, scraperSlug, site) {
|
||||||
const actorSlugA = slugify(actorName, { delimiter: '' });
|
const actorSlugA = slugify(actorName, '');
|
||||||
const actorSlugB = slugify(actorName);
|
const actorSlugB = slugify(actorName);
|
||||||
|
|
||||||
const t1 = site.parameters?.t1 ? 't1/' : '';
|
const t1 = site.parameters?.t1 ? 't1/' : '';
|
||||||
|
|
|
@ -384,8 +384,8 @@ async function fetchMovie(url, site) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchProfile(actorName) {
|
async function fetchProfile(actorName) {
|
||||||
const actorSlugA = slugify(actorName, { delimiter: '-' });
|
const actorSlugA = slugify(actorName, '-');
|
||||||
const actorSlugB = slugify(actorName, { delimiter: '' });
|
const actorSlugB = slugify(actorName, '');
|
||||||
|
|
||||||
const urlA = `https://julesjordan.com/trial/models/${actorSlugA}.html`;
|
const urlA = `https://julesjordan.com/trial/models/${actorSlugA}.html`;
|
||||||
const urlB = `https://julesjordan.com/trial/models/${actorSlugB}.html`;
|
const urlB = `https://julesjordan.com/trial/models/${actorSlugB}.html`;
|
||||||
|
|
|
@ -98,7 +98,7 @@ function scrapeScene(data, url, _site, networkName) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const siteName = data.collections[0]?.name || data.brand;
|
const siteName = data.collections[0]?.name || data.brand;
|
||||||
release.channel = slugify(siteName, { delimiter: '' });
|
release.channel = slugify(siteName, '');
|
||||||
|
|
||||||
release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`;
|
release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`;
|
||||||
|
|
||||||
|
|
|
@ -94,7 +94,7 @@ function scrapeProfile({ qu }, _actorName, origin) {
|
||||||
const keys = qu.all('.model-profile h5', true);
|
const keys = qu.all('.model-profile h5', true);
|
||||||
const values = qu.all('.model-profile h5 + p', true);
|
const values = qu.all('.model-profile h5 + p', true);
|
||||||
|
|
||||||
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, { delimiter: '_' })]: values[index] }), {});
|
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {});
|
||||||
|
|
||||||
profile.age = Number(bio.age);
|
profile.age = Number(bio.age);
|
||||||
profile.description = qu.q('.model-bio', true);
|
profile.description = qu.q('.model-bio', true);
|
||||||
|
|
|
@ -95,7 +95,7 @@ async function scrapeScene(html, url, site) {
|
||||||
release.movie = $('a[data-track="FULL MOVIE"]').attr('href');
|
release.movie = $('a[data-track="FULL MOVIE"]').attr('href');
|
||||||
|
|
||||||
const siteElement = $('.content-wrapper .logos-sites a');
|
const siteElement = $('.content-wrapper .logos-sites a');
|
||||||
if (siteElement) release.channel = slugify(siteElement.text(), { delimiter: '' });
|
if (siteElement) release.channel = slugify(siteElement.text(), '');
|
||||||
|
|
||||||
return release;
|
return release;
|
||||||
}
|
}
|
||||||
|
@ -108,7 +108,7 @@ function scrapeProfile({ html, q, qa, qtx }) {
|
||||||
const trimmedValue = value.trim();
|
const trimmedValue = value.trim();
|
||||||
|
|
||||||
if (trimmedValue.length === 0 || trimmedValue === '-') return acc;
|
if (trimmedValue.length === 0 || trimmedValue === '-') return acc;
|
||||||
return { ...acc, [slugify(key, { delimiter: '_' })]: trimmedValue };
|
return { ...acc, [slugify(key, '_')]: trimmedValue };
|
||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
const description = q('.model-facts-long', true);
|
const description = q('.model-facts-long', true);
|
||||||
|
@ -176,7 +176,7 @@ async function fetchScene(url, site) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchProfile(actorName) {
|
async function fetchProfile(actorName) {
|
||||||
const actorSearchSlug = slugify(actorName, { delimiter: '+' });
|
const actorSearchSlug = slugify(actorName, '+');
|
||||||
const url = `https://www.private.com/search.php?query=${actorSearchSlug}`;
|
const url = `https://www.private.com/search.php?query=${actorSearchSlug}`;
|
||||||
const modelRes = await geta(url, '.model h3 a');
|
const modelRes = await geta(url, '.model h3 a');
|
||||||
|
|
||||||
|
|
|
@ -155,7 +155,7 @@ async function scrapeProfile(html, actorUrl, withReleases) {
|
||||||
|
|
||||||
const bio = qa('.stat').reduce((acc, el) => {
|
const bio = qa('.stat').reduce((acc, el) => {
|
||||||
const prop = q(el, '.label', true).slice(0, -1);
|
const prop = q(el, '.label', true).slice(0, -1);
|
||||||
const key = slugify(prop, { delimiter: '_' });
|
const key = slugify(prop, '_');
|
||||||
const value = q(el, '.value', true);
|
const value = q(el, '.value', true);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
22
src/sites.js
|
@ -7,19 +7,13 @@ const argv = require('./argv');
|
||||||
const knex = require('./knex');
|
const knex = require('./knex');
|
||||||
const whereOr = require('./utils/where-or');
|
const whereOr = require('./utils/where-or');
|
||||||
|
|
||||||
async function curateSite(site, includeParameters = false) {
|
async function curateSite(site, includeParameters = false, includeTags = true) {
|
||||||
const tags = await knex('sites_tags')
|
const curatedSite = {
|
||||||
.select('tags.*', 'sites_tags.inherit')
|
|
||||||
.where('site_id', site.id)
|
|
||||||
.join('tags', 'tags.id', 'sites_tags.tag_id');
|
|
||||||
|
|
||||||
return {
|
|
||||||
id: site.id,
|
id: site.id,
|
||||||
name: site.name,
|
name: site.name,
|
||||||
url: site.url,
|
url: site.url,
|
||||||
description: site.description,
|
description: site.description,
|
||||||
slug: site.slug,
|
slug: site.slug,
|
||||||
tags,
|
|
||||||
independent: !!site.parameters && site.parameters.independent,
|
independent: !!site.parameters && site.parameters.independent,
|
||||||
parameters: includeParameters ? site.parameters : null,
|
parameters: includeParameters ? site.parameters : null,
|
||||||
network: {
|
network: {
|
||||||
|
@ -31,6 +25,15 @@ async function curateSite(site, includeParameters = false) {
|
||||||
parameters: includeParameters ? site.network_parameters : null,
|
parameters: includeParameters ? site.network_parameters : null,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (includeTags) {
|
||||||
|
curatedSite.tags = await knex('sites_tags')
|
||||||
|
.select('tags.*', 'sites_tags.inherit')
|
||||||
|
.where('site_id', site.id)
|
||||||
|
.join('tags', 'tags.id', 'sites_tags.tag_id');
|
||||||
|
}
|
||||||
|
|
||||||
|
return curatedSite;
|
||||||
}
|
}
|
||||||
|
|
||||||
function curateSites(sites, includeParameters) {
|
function curateSites(sites, includeParameters) {
|
||||||
|
@ -78,7 +81,7 @@ async function findSiteByUrl(url) {
|
||||||
.first();
|
.first();
|
||||||
|
|
||||||
if (site) {
|
if (site) {
|
||||||
const curatedSite = curateSite(site, true);
|
const curatedSite = curateSite(site, true, false);
|
||||||
|
|
||||||
return curatedSite;
|
return curatedSite;
|
||||||
}
|
}
|
||||||
|
@ -182,6 +185,7 @@ async function fetchSitesFromReleases() {
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
curateSite,
|
||||||
curateSites,
|
curateSites,
|
||||||
fetchIncludedSites,
|
fetchIncludedSites,
|
||||||
fetchSites,
|
fetchSites,
|
||||||
|
|
|
@ -0,0 +1,71 @@
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
const config = require('config');
|
||||||
|
|
||||||
|
const knex = require('./knex');
|
||||||
|
const slugify = require('./utils/slugify');
|
||||||
|
|
||||||
|
/**
 * Map a scraped release onto a row for the `releases` table.
 *
 * @param {Object} release - scraped release; `site` must be set, `studio` is optional
 * @param {*} batchId - ID of the batch the row is written under
 * @param {*} [existingRelease] - when truthy, `created_batch_id` is left off the row
 * @returns {Object} row keyed by column name
 */
function curateReleaseEntry(release, batchId, existingRelease) {
	// slug is built from the title, using the `encode` option and the configured length limit
	const titleSlug = slugify(release.title, '-', { encode: true, limit: config.titleSlugLength });

	const entry = {
		title: release.title,
		entry_id: release.entryId || null,
		site_id: release.site.id,
		shoot_id: release.shootId || null,
		studio_id: release.studio?.id || null,
		url: release.url,
		date: release.date,
		slug: titleSlug,
		description: release.description,
		duration: release.duration,
		type: release.type,
		// director and rating (likes, dislikes, stars) are currently not written to the row
		deep: release.deep === true, // anything but a literal `true` is stored as false
		deep_url: release.deepUrl,
		updated_batch_id: batchId,
	};

	if (existingRelease) {
		return entry;
	}

	// only rows without an existing counterpart record the batch that created them
	return { ...entry, created_batch_id: batchId };
}
|
||||||
|
|
||||||
|
/**
 * Placeholder for resolving the site each release belongs to.
 *
 * No site lookup is implemented yet. The releases are handed back unchanged so
 * that a caller binding the result receives the list rather than `undefined`.
 *
 * @param {Object[]} releases - scraped releases
 * @returns {Promise<Object[]>} the releases that were passed in, untouched
 */
async function attachSite(releases) {
	// TODO: resolve and attach a site for releases that have none, or only a
	// fallback one (`!release.site || release.site.isFallback`)
	return releases;
}
|
||||||
|
|
||||||
|
/**
 * Drop releases that already have a row in the `releases` table.
 *
 * Rows are matched on the (entry ID, site ID) pair. Releases that repeat
 * within `releases` itself are not collapsed; only stored rows are compared.
 *
 * @param {Object[]} releases - scraped releases, each with `entryId` and `site.id`
 * @returns {Promise<Object[]>} the releases without a stored counterpart
 */
async function extractUniqueReleases(releases) {
	const storedEntries = await knex('releases').whereIn(
		['entry_id', 'site_id'],
		releases.map(release => [release.entryId, release.site.id]),
	);

	// index the stored rows by a combined site/entry key for constant-time lookups
	const storedKeys = new Set();

	storedEntries.forEach((entry) => {
		storedKeys.add(`${entry.site_id}_${entry.entry_id}`);
	});

	return releases.filter((release) => {
		const releaseKey = `${release.site.id}_${release.entryId}`;

		return !storedKeys.has(releaseKey);
	});
}
|
||||||
|
|
||||||
|
/**
 * Store the scraped releases that are not in the database yet.
 *
 * A row is inserted into `batches` first; its ID is passed to
 * `curateReleaseEntry` for every release row that gets inserted.
 *
 * @param {Object[]} releases - scraped releases, each with a `site`
 * @returns {Promise<void>}
 */
async function storeReleases(releases) {
	const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

	const uniqueReleases = await extractUniqueReleases(releases);

	// the call is kept as it was, but its result is not consumed anywhere yet
	await attachSite(releases);

	// every unique release is stored; the previous two-entry cap dropped the rest silently
	const curatedReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));

	if (curatedReleaseEntries.length === 0) {
		// nothing new to store, so no insert statement is issued
		return;
	}

	await knex('releases').insert(curatedReleaseEntries);
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
storeReleases,
|
||||||
|
};
|
|
@ -83,7 +83,10 @@ async function scrapeLatestReleases(scraper, site, preData) {
|
||||||
const latestReleasesWithSite = latestReleases.map(release => ({ ...release, site: release.site || site })); // attach site release is assigned to when stored
|
const latestReleasesWithSite = latestReleases.map(release => ({ ...release, site: release.site || site })); // attach site release is assigned to when stored
|
||||||
const oldestReleaseOnPage = latestReleases.sort((releaseA, releaseB) => releaseB.date - releaseA.date).slice(-1)[0];
|
const oldestReleaseOnPage = latestReleases.sort((releaseA, releaseB) => releaseB.date - releaseA.date).slice(-1)[0];
|
||||||
|
|
||||||
const uniqueReleases = await extractUniqueReleases(latestReleasesWithSite, accReleases);
|
const uniqueReleases = argv.redownload
|
||||||
|
? latestReleasesWithSite
|
||||||
|
: await extractUniqueReleases(latestReleasesWithSite, accReleases);
|
||||||
|
|
||||||
const pageAccReleases = accReleases.concat(uniqueReleases);
|
const pageAccReleases = accReleases.concat(uniqueReleases);
|
||||||
|
|
||||||
logger.verbose(`Scraped '${site.name}' (${site.network.name}) page ${page}, found ${uniqueReleases.length} unique releases`);
|
logger.verbose(`Scraped '${site.name}' (${site.network.name}) page ${page}, found ${uniqueReleases.length} unique releases`);
|
||||||
|
@ -204,7 +207,9 @@ async function fetchUpdates() {
|
||||||
{ concurrency: 5 },
|
{ concurrency: 5 },
|
||||||
);
|
);
|
||||||
|
|
||||||
return scrapedNetworks;
|
const releases = scrapedNetworks.flat(2);
|
||||||
|
|
||||||
|
return releases;
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = fetchUpdates;
|
module.exports = fetchUpdates;
|
|
@ -1,13 +1,14 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
function slugify(string, {
|
function slugify(string, delimiter = '-', {
|
||||||
encode = false,
|
encode = false,
|
||||||
delimiter = '-',
|
|
||||||
limit = 1000,
|
limit = 1000,
|
||||||
} = {}) {
|
} = {}) {
|
||||||
const slugComponents = string.trim().toLowerCase().match(/\w+/g);
|
const slugComponents = string.trim().toLowerCase().match(/\w+/g);
|
||||||
|
|
||||||
if (!slugComponents) return '';
|
if (!slugComponents) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
const slug = slugComponents.reduce((acc, component, index) => {
|
const slug = slugComponents.reduce((acc, component, index) => {
|
||||||
const accSlug = `${acc}${index > 0 ? delimiter : ''}${component}`;
|
const accSlug = `${acc}${index > 0 ? delimiter : ''}${component}`;
|
||||||
|
|