Entity refactor. Facilitating channels without parent.
This commit is contained in:
@@ -1,539 +0,0 @@
|
||||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
const UrlPattern = require('url-pattern');
|
||||
const moment = require('moment');
|
||||
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const argv = require('./argv');
|
||||
const include = require('./utils/argv-include')(argv);
|
||||
const scrapers = require('./scrapers/scrapers');
|
||||
const whereOr = require('./utils/where-or');
|
||||
const resolvePlace = require('./utils/resolve-place');
|
||||
const slugify = require('./utils/slugify');
|
||||
const capitalize = require('./utils/capitalize');
|
||||
const { curateSites } = require('./sites');
|
||||
const { storeMedia, associateMedia } = require('./media');
|
||||
|
||||
/**
 * Turn a raw `actors` row into the camelCased actor object returned to callers.
 *
 * Aliases, avatar, photos and social links are loaded from their own tables,
 * all keyed on the actor's id.
 *
 * @param {Object} actor - Row from `actors`; may carry joined birth/residence country columns.
 * @returns {Promise<Object>} The curated actor; `age` is only present when a birthdate is known.
 */
async function curateActor(actor) {
	const aliasQuery = knex('actors').where({ alias_for: actor.id });

	const avatarQuery = knex('actors_avatars')
		.where('actor_id', actor.id)
		.join('media', 'media.id', 'actors_avatars.media_id')
		.first();

	const photosQuery = knex('actors_photos')
		.where('actor_id', actor.id)
		.join('media', 'media.id', 'actors_photos.media_id')
		.orderBy('index');

	const socialQuery = knex('actors_social')
		.where('actor_id', actor.id)
		.orderBy('platform', 'desc');

	const [aliases, avatar, photos, social] = await Promise.all([
		aliasQuery,
		avatarQuery,
		photosQuery,
		socialQuery,
	]);

	// a place is null unless at least one of its components is known
	const toPlace = (city, state, country) => {
		if (!city && !state && !country.alpha2) {
			return null;
		}

		const place = {};

		if (city) place.city = city;
		if (state) place.state = state;
		if (country.alpha2) place.country = country;

		return place;
	};

	const origin = toPlace(actor.birth_city, actor.birth_state, {
		alpha2: actor.birth_country_alpha2,
		name: actor.birth_country_name,
		alias: actor.birth_country_alias,
	});

	const residence = toPlace(actor.residence_city, actor.residence_state, {
		alpha2: actor.residence_country_alpha2,
		name: actor.residence_country_name,
		alias: actor.residence_country_alias,
	});

	const curated = {
		id: actor.id,
		gender: actor.gender,
		name: actor.name,
		description: actor.description,
		birthdate: actor.birthdate && new Date(actor.birthdate),
		country: actor.country_alpha2,
		origin,
		residence,
		ethnicity: actor.ethnicity,
		height: actor.height,
		weight: actor.weight,
		bust: actor.bust,
		waist: actor.waist,
		hip: actor.hip,
		naturalBoobs: actor.natural_boobs,
		aliases: aliases.map(alias => alias.name),
		slug: actor.slug,
		avatar,
		photos,
		hasTattoos: actor.has_tattoos,
		hasPiercings: actor.has_piercings,
		tattoos: actor.tattoos,
		piercings: actor.piercings,
		social,
		scrapedAt: actor.scraped_at,
	};

	if (curated.birthdate) {
		// age in whole years as of now
		curated.age = moment().diff(curated.birthdate, 'years');
	}

	return curated;
}
|
||||
|
||||
/**
 * Curate a list of actor rows concurrently.
 *
 * @param {Object[]} releases - Actor rows (the parameter name is historical).
 * @returns {Promise<Object[]>} Curated actors in input order.
 */
function curateActors(releases) {
	const pending = releases.map(actorRow => curateActor(actorRow));

	return Promise.all(pending);
}
|
||||
|
||||
/**
 * Map a camelCased actor/profile object onto the snake_cased column layout
 * used when writing to the `actors` table.
 *
 * @param {Object} actor - Actor or merged profile.
 * @param {boolean} scraped - When truthy, stamp the entry with the scrape time and outcome.
 * @param {boolean} scrapeSuccess - Stored as `scrape_success` when `scraped` is truthy.
 * @returns {Object} Column/value pairs for insert or update.
 */
function curateActorEntry(actor, scraped, scrapeSuccess) {
	const { birthPlace, residencePlace } = actor;

	const entry = {
		name: capitalize(actor.name),
		slug: slugify(actor.name),
		birthdate: actor.birthdate,
		description: actor.description,
		gender: actor.gender,
		ethnicity: actor.ethnicity,
		bust: actor.bust,
		waist: actor.waist,
		hip: actor.hip,
		natural_boobs: actor.naturalBoobs,
		height: actor.height,
		weight: actor.weight,
		hair: actor.hair,
		eyes: actor.eyes,
		has_tattoos: actor.hasTattoos,
		has_piercings: actor.hasPiercings,
		tattoos: actor.tattoos,
		piercings: actor.piercings,
	};

	if (actor.id) {
		entry.id = actor.id;
	}

	if (birthPlace) {
		Object.assign(entry, {
			birth_city: birthPlace.city,
			birth_state: birthPlace.state,
			birth_country_alpha2: birthPlace.country,
		});
	}

	if (residencePlace) {
		Object.assign(entry, {
			residence_city: residencePlace.city,
			residence_state: residencePlace.state,
			residence_country_alpha2: residencePlace.country,
		});
	}

	if (scraped) {
		Object.assign(entry, {
			scraped_at: new Date(),
			scrape_success: scrapeSuccess,
		});
	}

	return entry;
}
|
||||
|
||||
/**
 * Recognise the platform a social URL belongs to and normalise it to that
 * platform's canonical profile URL. Unrecognised URLs are kept as-is with
 * an undefined platform.
 *
 * @param {string} url - Social link as supplied by a scraper.
 * @param {number} actorId - Actor the link belongs to.
 * @returns {{url: string, platform: (string|undefined), actor_id: number}} Row for `actors_social`.
 */
function curateSocialEntry(url, actorId) {
	// [label, url-pattern, canonical URL builder]
	// links supplied by PH often look like domain.com/domain.com/username
	const platforms = [
		['twitter', 'http(s)\\://(*)twitter.com/:username(/)(?*)', username => `https://www.twitter.com/${username}`],
		['youtube', 'http(s)\\://(*)youtube.com/channel/:username(?*)', username => `https://www.youtube.com/channel/${username}`],
		['instagram', 'http(s)\\://(*)instagram.com/:username(/)(?*)', username => `https://www.instagram.com/${username}`],
		['snapchat', 'http(s)\\://(*)snapchat.com/add/:username(/)(?*)', username => `https://www.snapchat.com/add/${username}`],
		['tumblr', 'http(s)\\://:username.tumblr.com(*)', username => `https://${username}.tumblr.com`],
		['onlyfans', 'http(s)\\://(*)onlyfans.com/:username(/)(?*)', username => `https://www.onlyfans.com/${username}`],
		['fancentro', 'http(s)\\://(*)fancentro.com/:username(/)(?*)', username => `https://www.fancentro.com/${username}`],
		['modelhub', 'http(s)\\://(*)modelhub.com/:username(/)(?*)', username => `https://www.modelhub.com/${username}`],
	];

	let curatedUrl = url;
	let platformLabel;

	// stop at the first platform whose pattern matches
	platforms.some(([label, pattern, format]) => {
		const patternMatch = new UrlPattern(pattern).match(url);

		if (!patternMatch) {
			return false;
		}

		platformLabel = label;
		curatedUrl = format(patternMatch.username);

		return true;
	});

	return {
		url: curatedUrl,
		platform: platformLabel,
		actor_id: actorId,
	};
}
|
||||
|
||||
/**
 * Curate a list of social URLs for an actor, dropping links that are
 * duplicated within the list or already stored for that actor
 * (URLs are compared case-insensitively).
 *
 * @param {string[]} [urls] - Social links; a falsy value yields an empty list.
 * @param {number} actorId - Actor the links belong to.
 * @returns {Promise<Object[]>} New `actors_social` rows.
 */
async function curateSocialEntries(urls, actorId) {
	if (!urls) {
		return [];
	}

	const existingSocialLinks = await knex('actors_social').where('actor_id', actorId);
	const freshEntries = [];

	urls.forEach((url) => {
		const candidate = curateSocialEntry(url, actorId);
		const sameUrl = entry => candidate.url.toLowerCase() === entry.url.toLowerCase();

		// prevent duplicates
		if (freshEntries.some(sameUrl) || existingSocialLinks.some(sameUrl)) {
			return;
		}

		freshEntries.push(candidate);
	});

	return freshEntries;
}
|
||||
|
||||
/**
 * Fetch and curate actors matching a query object, joined with the
 * countries of birth and residence.
 *
 * @param {Object} queryObject - Passed to `whereOr` against the `actors` table.
 * @param {number} [limit=100] - Maximum number of rows.
 * @returns {Promise<Object[]>} Curated actors ordered by name, then gender.
 */
async function fetchActors(queryObject, limit = 100) {
	const columns = [
		'actors.*',
		'birth_countries.alpha2 as birth_country_alpha2',
		'birth_countries.name as birth_country_name',
		'birth_countries.alias as birth_country_alias',
		'residence_countries.alpha2 as residence_country_alpha2',
		'residence_countries.name as residence_country_name',
		'residence_countries.alias as residence_country_alias',
	];

	const actorRows = await knex('actors')
		.select(...columns)
		.leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
		.leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2')
		.orderBy(['actors.name', 'actors.gender'])
		.where(builder => whereOr(queryObject, 'actors', builder))
		.limit(limit);

	return curateActors(actorRows);
}
|
||||
|
||||
/**
 * Curate the given social URLs and insert the ones not yet known for the actor.
 *
 * @param {string[]} [urls] - Social links found for the actor.
 * @param {number} actorId - Actor the links belong to.
 * @returns {Promise<void>}
 */
async function storeSocialLinks(urls, actorId) {
	const newSocialEntries = await curateSocialEntries(urls, actorId);

	await knex('actors_social').insert(newSocialEntries);
}
|
||||
|
||||
/**
 * Store avatar media and associate it with the actor.
 *
 * @param {Array} [avatars] - Avatar sources; nothing is stored when missing or empty.
 * @param {number} actorId - Actor to associate the avatars with.
 * @returns {Promise<*>} The result of `storeMedia`, or an empty array when there were no avatars.
 */
async function storeAvatars(avatars, actorId) {
	const hasAvatars = avatars && avatars.length !== 0;

	if (!hasAvatars) {
		return [];
	}

	const storedAvatars = await storeMedia(avatars, 'actor', 'avatar');
	const avatarsByActorId = { [actorId]: avatars };

	await associateMedia(avatarsByActorId, storedAvatars, 'actor', 'photo', 'avatar');

	return storedAvatars;
}
|
||||
|
||||
/**
 * Insert a new actor, followed by its social links and (when present) avatars.
 *
 * @param {Object} actor - Actor or merged profile to store.
 * @param {boolean} [scraped=false] - Whether the data comes from a profile scrape.
 * @param {boolean} [scrapeSuccess=false] - Outcome of that scrape.
 * @returns {Promise<Object>} The inserted `actors` row.
 */
async function storeActor(actor, scraped = false, scrapeSuccess = false) {
	const entry = curateActorEntry(actor, scraped, scrapeSuccess);
	const insertedRows = await knex('actors').insert(entry).returning('*');
	const actorEntry = insertedRows[0];

	await storeSocialLinks(actor.social, actorEntry.id);

	if (actor.avatars) {
		await storeAvatars(actor.avatars, actorEntry.id);
	}

	logger.info(`Added new entry for actor '${actor.name}'`);

	return actorEntry;
}
|
||||
|
||||
/**
 * Update an existing actor row (matched on `actor.id`) and store any new social links.
 *
 * @param {Object} actor - Actor or merged profile; must carry the id of the row to update.
 * @param {boolean} [scraped=false] - Whether the data comes from a profile scrape.
 * @param {boolean} [scrapeSuccess=false] - Outcome of that scrape.
 * @returns {Promise<Object>} The updated `actors` row.
 */
async function updateActor(actor, scraped = false, scrapeSuccess = false) {
	const entry = curateActorEntry(actor, scraped, scrapeSuccess);

	const updatedRows = await knex('actors')
		.where({ id: actor.id })
		.update(entry)
		.returning('*');

	await storeSocialLinks(actor.social, actor.id);

	logger.info(`Updated entry for actor '${actor.name}'`);

	return updatedRows[0];
}
|
||||
|
||||
/**
 * Normalise a single scraped avatar (a bare source or an object with `src`)
 * and attribute it to the scraper/site it came from.
 */
function curateProfileAvatar(avatar, profile) {
	return {
		src: avatar.src || avatar,
		scraper: profile.scraper,
		// an explicit copyright on the avatar wins over the site/scraper name
		copyright: avatar.copyright === undefined
			? capitalize(profile.site?.name || profile.scraper)
			: avatar.copyright,
	};
}

/**
 * Merge the profiles scraped from several sources into one profile.
 *
 * Profiles are merged in order: for most fields the first source that
 * supplies a truthy value wins; social links, releases and avatars are
 * accumulated. Birth and residence places are then passed through
 * `resolvePlace`, and when no birth place results the nationality is used
 * to look up a country of birth.
 *
 * @param {Array<Object|null>} profiles - Scraped profiles; empty (falsy) slots are skipped.
 * @param {Object} [actor] - Existing actor entry; when given, its id and name take precedence.
 * @returns {Promise<Object|null>} The merged profile, or null when no profile was found.
 */
async function mergeProfiles(profiles, actor) {
	if (profiles.filter(Boolean).length === 0) {
		return null;
	}

	const mergedProfile = profiles.reduce((prevProfile, profile) => {
		// skip every empty slot, consistent with the filter(Boolean) guard above
		if (!profile) {
			return prevProfile;
		}

		const accProfile = {
			id: actor ? actor.id : null,
			name: actor ? actor.name : (prevProfile.name || profile.name),
			description: prevProfile.description || profile.description,
			gender: prevProfile.gender || profile.gender,
			// a previous birthdate only wins when it is a valid date
			birthdate: !prevProfile.birthdate || Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate,
			birthPlace: prevProfile.birthPlace || profile.birthPlace,
			residencePlace: prevProfile.residencePlace || profile.residencePlace,
			nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available
			ethnicity: prevProfile.ethnicity || profile.ethnicity,
			bust: prevProfile.bust || (/\d+\w+/.test(profile.bust) ? profile.bust : null),
			waist: prevProfile.waist || profile.waist,
			hip: prevProfile.hip || profile.hip,
			// booleans: false is a valid value, only undefined falls through
			naturalBoobs: prevProfile.naturalBoobs === undefined ? profile.naturalBoobs : prevProfile.naturalBoobs,
			height: prevProfile.height || profile.height,
			weight: prevProfile.weight || profile.weight,
			hair: prevProfile.hair || profile.hair,
			eyes: prevProfile.eyes || profile.eyes,
			hasPiercings: prevProfile.hasPiercings === undefined ? profile.hasPiercings : prevProfile.hasPiercings,
			hasTattoos: prevProfile.hasTattoos === undefined ? profile.hasTattoos : prevProfile.hasTattoos,
			piercings: prevProfile.piercings || profile.piercings,
			tattoos: prevProfile.tattoos || profile.tattoos,
			social: prevProfile.social.concat(profile.social || []),
			releases: prevProfile.releases.concat(profile.releases ? profile.releases : []), // don't flatten fallbacks
		};

		if (profile.avatar) {
			// an array of avatars is a list of fallbacks for one avatar slot
			const avatar = Array.isArray(profile.avatar)
				? profile.avatar.map(avatarX => curateProfileAvatar(avatarX, profile))
				: curateProfileAvatar(profile.avatar, profile);

			accProfile.avatars = prevProfile.avatars.concat([avatar]); // don't flatten fallbacks
		} else {
			accProfile.avatars = prevProfile.avatars;
		}

		return accProfile;
	}, {
		social: [],
		avatars: [],
		releases: [],
	});

	const [birthPlace, residencePlace] = await Promise.all([
		resolvePlace(mergedProfile.birthPlace),
		resolvePlace(mergedProfile.residencePlace),
	]);

	mergedProfile.birthPlace = birthPlace;
	mergedProfile.residencePlace = residencePlace;

	if (!mergedProfile.birthPlace && mergedProfile.nationality) {
		const country = await knex('countries')
			.where('nationality', 'ilike', `%${mergedProfile.nationality}%`)
			.orderBy('priority', 'desc')
			.first();

		// the nationality may not match any known country
		if (country) {
			mergedProfile.birthPlace = {
				country: country.alpha2,
			};
		}
	}

	return mergedProfile;
}
|
||||
|
||||
/**
 * Scrape an actor's profile from every given source.
 *
 * A source is either a single scraper slug or an array of slugs. The slugs
 * of an array are tried in order and the first one to yield a profile wins;
 * later slugs are only tried when the previous attempt failed.
 *
 * @param {Array<string|string[]>} sources - Scraper slugs, optionally grouped.
 * @param {string} actorName - Name the scrape was requested for; stored as the profile name.
 * @param {Object} [actorEntry] - Existing actor entry; its name is used for the lookup when present.
 * @param {Object} sitesBySlug - Sites and networks keyed by slug, handed to the scraper.
 * @returns {Promise<Array<Object|null>>} One profile per source, null where none was found.
 */
async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) {
	return Promise.map(sources, async (source) => {
		const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] }));

		try {
			// chain of fallbacks: each scraper only runs when the previous outcome rejected,
			// the initial rejection kicks off the first scraper
			return await profileScrapers.reduce(async (outcome, { scraper, scraperSlug }) => outcome.catch(async () => {
				if (!scraper) {
					logger.warn(`No profile scraper available for ${scraperSlug}`);
					throw new Error(`No profile scraper available for ${scraperSlug}`);
				}

				logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);

				const site = sitesBySlug[scraperSlug] || null;
				const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, include);

				// a numeric result is treated as a failure and is logged below
				if (profile && typeof profile !== 'number') {
					logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);

					return {
						...profile,
						name: actorName,
						scraper: scraperSlug,
						site,
						releases: profile.releases?.map(release => (typeof release === 'string'
							? { url: release, site }
							: { ...release, site: release.site || site }
						)),
					};
				}

				logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}: ${profile}`);

				// an absent profile is expected, flag the error so it is not reported as a warning
				throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false });
			}), Promise.reject(new Error()));
		} catch (error) {
			if (error.warn !== false) {
				logger.warn(`Error in scraper ${source}: ${error.message}`);
			}
		}

		return null;
	});
}
|
||||
|
||||
/**
 * Scrape, merge and optionally store the profiles of the given actors.
 *
 * Sources come from `argv.sources`, then `config.profiles`, then all known
 * actor scrapers. Actors are processed three at a time; a failure for one
 * actor is logged and yields null for that actor only.
 *
 * @param {string[]} [actorNames] - Names to scrape; defaults to `argv.actors`.
 * @returns {Promise<Array<Object|null>>} Merged profile per actor, null when none was found or scraping failed.
 */
async function scrapeActors(actorNames) {
	return Promise.map(actorNames || argv.actors, async (actorName) => {
		try {
			const actorSlug = slugify(actorName);
			const actorEntry = await knex('actors').where({ slug: actorSlug }).first();
			const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);

			const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested
			const sourceSlugs = finalSources.flat();

			const [siteEntries, networkEntries] = await Promise.all([
				knex('sites')
					.leftJoin('networks', 'sites.network_id', 'networks.id')
					.select(
						'sites.*',
						'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
					)
					.whereIn('sites.slug', sourceSlugs),
				knex('networks').select('*').whereIn('slug', sourceSlugs),
			]);

			const sites = await curateSites(siteEntries, true);
			const networks = networkEntries.map(network => ({ ...network, isFallback: true }));
			// sites come last so a site overrides a network with the same slug
			const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});

			// pass the ungrouped sources, otherwise the flattening above has no effect
			const profiles = await scrapeProfiles(finalSources, actorName, actorEntry, sitesBySlug);
			const profile = await mergeProfiles(profiles, actorEntry);

			if (profile === null) {
				logger.warn(`Could not find profile for actor '${actorName}'`);

				if (argv.save && !actorEntry) {
					// store a bare entry for the unknown actor
					await storeActor({ name: actorName }, false, false);
				}

				return null;
			}

			if (argv.inspect) {
				console.log(profile);
				logger.info(`Found ${profile.releases.length} releases for ${actorName}`);
			}

			if (argv.save) {
				if (actorEntry) {
					await Promise.all([
						updateActor(profile, true, true),
						storeAvatars(profile.avatars, actorEntry.id),
					]);

					return profile;
				}

				await storeActor(profile, true, true);
			}

			return profile;
		} catch (error) {
			console.log(error);
			logger.warn(`${actorName}: ${error}`);

			return null;
		}
	}, {
		concurrency: 3,
	});
}
|
||||
|
||||
/**
 * Scrape profiles for every actor whose `scraped_at` is still null.
 *
 * @returns {Promise<Array<Object|null>>} Result of `scrapeActors` for those actors.
 */
async function scrapeBasicActors() {
	const unscrapedActors = await knex('actors').where('scraped_at', null);
	const unscrapedNames = unscrapedActors.map(({ name }) => name);

	return scrapeActors(unscrapedNames);
}
|
||||
|
||||
/**
 * Link actors to their releases in `releases_actors`, creating actor
 * entries for slugs that are not in the database yet.
 *
 * @param {Object} mappedActors - Actors keyed by slug; each carries a `name` and an iterable `releaseIds`.
 * @param {Object[]} releases - Stored releases (only `id` is read).
 * @returns {Promise<void>}
 */
async function associateActors(mappedActors, releases) {
	const actorNames = Object.values(mappedActors).map(({ name }) => name);
	const actorSlugs = Object.keys(mappedActors);
	const releaseIds = releases.map(({ id }) => id);

	const [existingActorEntries, existingAssociationEntries] = await Promise.all([
		knex('actors')
			.whereIn('name', actorNames)
			.orWhereIn('slug', actorSlugs),
		knex('releases_actors').whereIn('release_id', releaseIds),
	]);

	const isStored = association => existingAssociationEntries.some(entry => (
		entry.actor_id === association.actor_id && entry.release_id === association.release_id
	));

	const associations = await Promise.map(Object.entries(mappedActors), async ([actorSlug, actor]) => {
		try {
			const knownEntry = existingActorEntries.find(entry => entry.slug === actorSlug);
			const actorEntry = knownEntry || await storeActor(actor);

			return Array.from(actor.releaseIds)
				.map(releaseId => ({
					release_id: releaseId,
					actor_id: actorEntry.id,
				}))
				// remove associations already in database
				.filter(association => !isStored(association));
		} catch (error) {
			logger.error(actor.name, error);

			return null;
		}
	});

	// failed actors yield null and are left out
	await knex('releases_actors').insert(associations.filter(Boolean).flat());

	// basic actor scraping is failure prone, don't run together with actor association
}
|
||||
|
||||
// public interface of the actors module
module.exports = { associateActors, fetchActors, scrapeActors, scrapeBasicActors };
|
||||
@@ -119,7 +119,7 @@ function toBaseActors(actorsOrNames, release) {
|
||||
const baseActor = {
|
||||
name,
|
||||
slug,
|
||||
entity: release?.site?.network || release?.entity?.parent || null,
|
||||
entity: release?.site?.network || release?.entity?.parent || release?.entity || null,
|
||||
};
|
||||
|
||||
if (actorOrName.name) {
|
||||
|
||||
@@ -16,7 +16,7 @@ function curateEntity(entity, includeParameters = false) {
|
||||
slug: entity.slug,
|
||||
type: entity.type,
|
||||
parameters: includeParameters ? entity.parameters : null,
|
||||
parent: entity.parent,
|
||||
parent: entity.parent_id && entity.parent,
|
||||
children: (entity.children || []).map(child => curateEntity({
|
||||
...child,
|
||||
parent: entity,
|
||||
|
||||
@@ -1,507 +0,0 @@
|
||||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
const moment = require('moment');
|
||||
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const argv = require('./argv');
|
||||
const whereOr = require('./utils/where-or');
|
||||
const { associateTags } = require('./tags');
|
||||
const { associateActors, scrapeBasicActors } = require('./actors');
|
||||
const {
|
||||
pluckItems,
|
||||
storeMedia,
|
||||
associateMedia,
|
||||
} = require('./media');
|
||||
const { fetchSites } = require('./sites');
|
||||
const slugify = require('./utils/slugify');
|
||||
const capitalize = require('./utils/capitalize');
|
||||
|
||||
/**
 * Apply the joins, columns, filters, ordering and limit shared by all
 * release queries to a query builder (intended for `.modify()`).
 *
 * @param {Object} queryBuilder - Query builder on (or joined with) `releases`.
 * @param {Object} [options]
 * @param {string|string[]} [options.filter=[]] - Tag slugs; releases carrying any of them are excluded.
 * @param {Date} [options.after] - Only releases dated after this moment (default January 1970).
 * @param {Date} [options.before] - Only releases dated up to this moment (default May 2109).
 * @param {number} [options.limit=100] - Maximum number of rows.
 */
function commonQuery(queryBuilder, {
	filter = [],
	after = new Date(0), // January 1970
	before = new Date(2 ** 44), // May 2109
	limit = 100,
} = {}) {
	const finalFilter = [].concat(filter); // ensure filter is array

	queryBuilder
		.leftJoin('sites', 'releases.site_id', 'sites.id')
		.leftJoin('studios', 'releases.studio_id', 'studios.id')
		.leftJoin('networks', 'sites.network_id', 'networks.id')
		.select(
			'releases.*',
			'sites.name as site_name', 'sites.slug as site_slug', 'sites.url as site_url', 'sites.network_id', 'sites.parameters as site_parameters',
			// the studio slug was previously missing; the site slug was selected twice instead
			'studios.name as studio_name', 'studios.slug as studio_slug', 'studios.url as studio_url',
			'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description',
		)
		.whereNotExists((builder) => {
			// apply tag filters
			builder
				.select('*')
				.from('tags_associated')
				.leftJoin('tags', 'tags_associated.tag_id', 'tags.id')
				.whereIn('tags.slug', finalFilter)
				.where('tags_associated.domain', 'releases')
				.whereRaw('tags_associated.target_id = releases.id');
		})
		.andWhere('releases.date', '>', after)
		.andWhere('releases.date', '<=', before)
		.orderBy([{ column: 'date', order: 'desc' }, { column: 'created_at', order: 'desc' }])
		.limit(limit);
}
|
||||
|
||||
/**
 * Turn a release row (as selected by `commonQuery`) into the release object
 * returned to callers, loading its actors, tags and media.
 *
 * @param {Object} release - Release row with joined site, studio and network columns.
 * @returns {Promise<Object>} The curated release.
 */
async function curateRelease(release) {
	const actorsQuery = knex('actors_associated')
		.select(
			'actors.id', 'actors.name', 'actors.gender', 'actors.slug', 'actors.birthdate',
			'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name', 'birth_countries.alias as birth_country_alias',
			'media.thumbnail as avatar',
		)
		.where({ release_id: release.id })
		.leftJoin('actors', 'actors.id', 'actors_associated.actor_id')
		.leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
		.leftJoin('media', (builder) => {
			// only the actor's first media item is joined, as the avatar
			builder
				.on('media.target_id', 'actors.id')
				.andOnVal('media.domain', 'actors')
				.andOnVal('media.index', '0');
		})
		.orderBy('actors.gender');

	const tagsQuery = knex('tags_associated')
		.select('tags.name', 'tags.slug')
		.where({
			domain: 'releases',
			target_id: release.id,
		})
		.leftJoin('tags', 'tags.id', 'tags_associated.tag_id')
		.orderBy('tags.priority', 'desc');

	const mediaQuery = knex('media')
		.where({
			target_id: release.id,
			domain: 'releases',
		})
		.orderBy(['role', 'index']);

	const [actors, tags, media] = await Promise.all([actorsQuery, tagsQuery, mediaQuery]);

	const hasRole = role => item => item.role === role;

	const curateReleaseActor = (actor) => {
		const origin = actor.birth_country_alpha2
			? {
				country: {
					name: actor.birth_country_alias,
					alpha2: actor.birth_country_alpha2,
				},
			}
			: null;

		return {
			id: actor.id,
			slug: actor.slug,
			name: actor.name,
			gender: actor.gender,
			birthdate: actor.birthdate,
			age: moment().diff(actor.birthdate, 'years'),
			// age at the time of the release
			ageThen: moment(release.date).diff(actor.birthdate, 'years'),
			avatar: actor.avatar,
			origin,
		};
	};

	const studio = release.studio_id
		? {
			id: release.studio_id,
			name: release.studio_name,
			slug: release.studio_slug,
			url: release.studio_url,
		}
		: null;

	return {
		id: release.id,
		type: release.type,
		title: release.title,
		date: release.date,
		dateAdded: release.created_at,
		description: release.description,
		url: release.url,
		shootId: release.shoot_id,
		entryId: release.entry_id,
		actors: actors.map(curateReleaseActor),
		director: release.director,
		tags,
		duration: release.duration,
		photos: media.filter(hasRole('photo')),
		poster: media.find(hasRole('poster')),
		covers: media.filter(hasRole('cover')),
		trailer: media.find(hasRole('trailer')),
		site: {
			id: release.site_id,
			name: release.site_name,
			independent: !!release.site_parameters?.independent,
			slug: release.site_slug,
			url: release.site_url,
		},
		studio,
		network: {
			id: release.network_id,
			name: release.network_name,
			description: release.network_description,
			slug: release.network_slug,
			url: release.network_url,
		},
	};
}
|
||||
|
||||
/**
 * Curate a list of release rows concurrently.
 *
 * @param {Object[]} releases - Release rows.
 * @returns {Promise<Object[]>} Curated releases in input order.
 */
function curateReleases(releases) {
	const pending = releases.map(releaseRow => curateRelease(releaseRow));

	return Promise.all(pending);
}
|
||||
|
||||
/**
 * Replace a release's fallback site with the site of its channel.
 *
 * Releases whose site is not a fallback are returned untouched, unless the
 * channel is flagged with `force`.
 *
 * @param {Object} release - Scraped release; `channel` is a name/slug string or an object with `name`/`slug`.
 * @returns {Promise<Object>} The release, with `site` replaced when a channel site was found.
 * @throws {Error} When no channel is given or no site matches the channel.
 */
async function attachChannelSite(release) {
	const needsChannelSite = release.site?.isFallback || release.channel?.force;

	if (!needsChannelSite) {
		return release;
	}

	const { channel } = release;

	if (!channel) {
		throw new Error(`Unable to derive channel site from generic URL: ${release.url}`);
	}

	const matchingSites = await fetchSites({
		name: channel.name || channel,
		slug: channel.slug || channel,
	});

	const site = matchingSites[0];

	if (!site) {
		throw new Error(`Unable to match channel '${channel.slug || channel}' from generic URL: ${release.url}`);
	}

	return { ...release, site };
}
|
||||
|
||||
/**
 * Replace a release's studio reference with the matching `studios` row.
 *
 * @param {Object} release - Scraped release; `studio` is matched against name, slug and URL.
 * @returns {Promise<Object>} The release; `studio` becomes the first matching row (undefined when none matches).
 */
async function attachStudio(release) {
	const studioRef = release.studio;

	if (!studioRef) {
		return release;
	}

	const studio = await knex('studios')
		.where('name', studioRef)
		.orWhere('slug', studioRef)
		.orWhere('url', studioRef)
		.first();

	return { ...release, studio };
}
|
||||
|
||||
/**
 * Map a scraped release onto the column layout of the `releases` table.
 *
 * @param {Object} release - Scraped release with its site (and optionally studio) attached.
 * @param {number} batchId - Id of the current batch.
 * @param {Object} [existingRelease] - Existing row; when absent the batch is also recorded as creation batch.
 * @returns {Promise<Object>} Column/value pairs for insert or update.
 */
async function curateReleaseEntry(release, batchId, existingRelease) {
	const slugOptions = {
		encode: true,
		limit: config.titleSlugLength,
	};

	const entry = {
		site_id: release.site.id,
		studio_id: release.studio ? release.studio.id : null,
		shoot_id: release.shootId || null,
		entry_id: release.entryId || null,
		type: release.type,
		url: release.url,
		title: release.title,
		slug: slugify(release.title, slugOptions),
		date: release.date,
		description: release.description,
		duration: release.duration,
		// anything but an explicit boolean counts as not deep
		deep: typeof release.deep === 'boolean' ? release.deep : false,
		deep_url: release.deepUrl,
		updated_batch_id: batchId,
	};

	if (!existingRelease) {
		entry.created_batch_id = batchId;
	}

	return entry;
}
|
||||
|
||||
/**
 * Fetch and curate releases matching a query object on the `releases` table.
 *
 * @param {Object} [queryObject={}] - Passed to `whereOr`.
 * @param {Object} [options={}] - Options for `commonQuery` (filter, after, before, limit).
 * @returns {Promise<Object[]>} Curated releases.
 */
async function fetchReleases(queryObject = {}, options = {}) {
	const matchReleases = builder => whereOr(queryObject, 'releases', builder);

	const releaseRows = await knex('releases')
		.modify(commonQuery, options)
		.andWhere(matchReleases);

	return curateReleases(releaseRows);
}
|
||||
|
||||
/**
 * Fetch and curate releases whose site matches a query object.
 *
 * @param {Object} queryObject - Passed to `whereOr` against the `sites` table.
 * @param {Object} [options={}] - Options for `commonQuery`.
 * @returns {Promise<Object[]>} Curated releases.
 */
async function fetchSiteReleases(queryObject, options = {}) {
	const matchSites = builder => whereOr(queryObject, 'sites', builder);

	const releaseRows = await knex('releases')
		.modify(commonQuery, options)
		.where(matchSites);

	return curateReleases(releaseRows);
}
|
||||
|
||||
/**
 * Fetch and curate releases whose network matches a query object.
 *
 * @param {Object} queryObject - Passed to `whereOr` against the `networks` table.
 * @param {Object} [options={}] - Options for `commonQuery`.
 * @returns {Promise<Object[]>} Curated releases.
 */
async function fetchNetworkReleases(queryObject, options = {}) {
	const matchNetworks = builder => whereOr(queryObject, 'networks', builder);

	const releaseRows = await knex('releases')
		.modify(commonQuery, options)
		.where(matchNetworks);

	return curateReleases(releaseRows);
}
|
||||
|
||||
/**
 * Fetch and curate releases associated with actors matching a query object.
 *
 * @param {Object} queryObject - Passed to `whereOr` against the `actors` table.
 * @param {Object} [options={}] - Options for `commonQuery`.
 * @returns {Promise<Object[]>} Curated releases.
 */
async function fetchActorReleases(queryObject, options = {}) {
	const matchActors = builder => whereOr(queryObject, 'actors', builder);

	const releaseRows = await knex('actors_associated')
		.leftJoin('releases', 'actors_associated.release_id', 'releases.id')
		.leftJoin('actors', 'actors_associated.actor_id', 'actors.id')
		.select('actors.name as actor_name')
		.modify(commonQuery, options)
		.where(matchActors);

	return curateReleases(releaseRows);
}
|
||||
|
||||
/**
 * Fetch and curate releases carrying tags that match a query object.
 *
 * @param {Object} queryObject - Passed to `whereOr` against the `tags` table.
 * @param {Object} [options={}] - Options for `commonQuery`.
 * @returns {Promise<Object[]>} Curated releases.
 */
async function fetchTagReleases(queryObject, options = {}) {
	const matchTags = builder => whereOr(queryObject, 'tags', builder);

	const releaseRows = await knex('tags_associated')
		.leftJoin('releases', 'tags_associated.target_id', 'releases.id')
		.leftJoin('tags', 'tags_associated.tag_id', 'tags.id')
		.select('tags.name as tag_name')
		.modify(commonQuery, options)
		// tag associations are shared between domains, only consider releases
		.where('tags_associated.domain', 'releases')
		.where(matchTags);

	return curateReleases(releaseRows);
}
|
||||
|
||||
/**
 * Collect the actors of a list of releases into a map keyed by actor slug.
 *
 * Each entry tracks the ids of the releases the actor appears in and the
 * avatars found for them. Actors may be given as plain names or as objects
 * with profile information, which is merged into the entry.
 *
 * @param {Object[]} releases - Releases; those without an `actors` array are ignored.
 * @returns {Object} Actors keyed by slug, each with `name`, `slug`, `releaseIds` (Set) and `avatars`.
 */
function accumulateActors(releases) {
	const actorsBySlug = {};

	releases.forEach((release) => {
		if (!Array.isArray(release.actors)) {
			return;
		}

		// an avatar without explicit copyright is attributed to the release's network
		const curateAvatar = source => ({
			src: source.src || source,
			copyright: source.copyright === undefined ? capitalize(release.site?.network?.name) : source.copyright,
		});

		release.actors.forEach((actor) => {
			const actorName = actor.name ? actor.name.trim() : actor.trim();
			const actorSlug = slugify(actorName);

			if (!actorSlug) {
				return;
			}

			if (!actorsBySlug[actorSlug]) {
				actorsBySlug[actorSlug] = {
					name: actorName,
					slug: actorSlug,
					releaseIds: new Set(),
					avatars: [],
				};
			}

			actorsBySlug[actorSlug].releaseIds.add(release.id);

			if (actor.name) {
				// actor input contains profile info
				actorsBySlug[actorSlug] = { ...actorsBySlug[actorSlug], ...actor };
			}

			if (actor.avatar) {
				const avatar = Array.isArray(actor.avatar)
					? actor.avatar.map(curateAvatar)
					: curateAvatar(actor.avatar);

				// don't flatten fallbacks
				actorsBySlug[actorSlug].avatars = actorsBySlug[actorSlug].avatars.concat([avatar]);
			}
		});
	});

	return actorsBySlug;
}
|
||||
|
||||
/**
 * Store the posters, covers, photos, trailers and teasers of the given
 * releases and associate them with their release, one role after another.
 *
 * Nothing happens unless `argv.media` is set; each role additionally needs
 * its category flag (`argv.images` or `argv.videos`) and its own flag.
 *
 * @param {Object[]} releases - Stored releases carrying their scraped media.
 * @returns {Promise<void>}
 */
async function storeReleaseAssets(releases) {
	if (!argv.media) {
		return;
	}

	// map every release id to the media items picked from that release
	const groupById = pick => releases.reduce((acc, release) => ({ ...acc, [release.id]: pick(release) }), {});

	const storeRole = async (itemsById, role) => {
		const stored = await storeMedia(Object.values(itemsById).flat(), 'release', role);

		if (stored) {
			await associateMedia(itemsById, stored, 'release', role);
		}
	};

	const releasePostersById = groupById(release => [release.poster]);
	const releaseCoversById = groupById(release => release.covers);
	const releaseTrailersById = groupById(release => [release.trailer]);
	const releaseTeasersById = groupById(release => [release.teaser]);
	const releasePhotosById = groupById(release => pluckItems(release.photos));

	if (argv.images && argv.posters) {
		await storeRole(releasePostersById, 'poster');
	}

	if (argv.images && argv.covers) {
		await storeRole(releaseCoversById, 'cover');
	}

	if (argv.images && argv.photos) {
		await storeRole(releasePhotosById, 'photo');
	}

	if (argv.videos && argv.trailers) {
		await storeRole(releaseTrailersById, 'trailer');
	}

	if (argv.videos && argv.teasers) {
		await storeRole(releaseTeasersById, 'teaser');
	}
}
|
||||
|
||||
/**
 * (Re)generate the full-text search documents in `releases_search`.
 *
 * Builds one tsvector per release from its title, site, network, shoot ID,
 * date, actors, tags and tag aliases, then upserts the result on `release_id`.
 *
 * @param {Array} [releaseIds] - IDs of the releases to refresh; every release is refreshed when omitted
 * @returns {Promise<void>}
 */
async function updateReleasesSearch(releaseIds) {
    logger.info(`Updating search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);

    // Every operand is wrapped in COALESCE: in PostgreSQL `||` yields NULL as soon
    // as one operand is NULL, so a release without title, site or network would
    // otherwise end up with an empty search document.
    const documents = await knex.raw(`
        SELECT
            releases.id AS release_id,
            TO_TSVECTOR(
                'traxxx',
                COALESCE(releases.title, '') || ' ' ||
                COALESCE(networks.name, '') || ' ' ||
                COALESCE(networks.slug, '') || ' ' ||
                COALESCE(networks.url, '') || ' ' ||
                COALESCE(sites.name, '') || ' ' ||
                COALESCE(sites.slug, '') || ' ' ||
                COALESCE(sites.url, '') || ' ' ||
                COALESCE(sites.alias, '') || ' ' ||
                COALESCE(releases.shoot_id, '') || ' ' ||
                COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
                STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
                STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
                STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
            ) as document
        FROM releases
        LEFT JOIN sites ON releases.site_id = sites.id
        LEFT JOIN networks ON sites.network_id = networks.id
        LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
        LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
        LEFT JOIN actors ON local_actors.actor_id = actors.id
        LEFT JOIN tags ON local_tags.tag_id = tags.id
        LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for
        ${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
        GROUP BY releases.id, sites.name, sites.slug, sites.alias, sites.url, networks.name, networks.slug, networks.url;
    `, releaseIds ? [releaseIds] : []); // always hand knex an array, matching the placeholder count

    if (documents.rows?.length > 0) {
        // upsert: the insert is rendered to SQL first so the ON CONFLICT clause can be appended
        const query = knex('releases_search').insert(documents.rows).toString();

        await knex.raw(`${query} ON CONFLICT (release_id) DO UPDATE SET document = EXCLUDED.document`);
    }
}
|
||||
|
||||
/**
 * Insert a scraped release, or update it when it already exists and
 * `argv.redownload` is set.
 *
 * A release is considered existing when a row with the same entry ID and
 * site ID is found. Tags are associated once per stored or updated release.
 *
 * @param {Object} release - scraped release; must carry `site` and `entryId`
 * @param {*} batchId - batch identifier, passed through to `curateReleaseEntry`
 * @returns {Promise<Object|null>} the inserted or updated row, the untouched
 *   existing row when not redownloading, or null when the entry ID is missing
 * @throws {Error} when the release has no site
 */
async function storeRelease(release, batchId) {
    if (!release.site) {
        throw new Error(`Missing site, unable to store "${release.title}" (${release.url})`);
    }

    if (!release.entryId) {
        logger.warn(`Missing entry ID, unable to store "${release.title}" (${release.url})`);
        return null;
    }

    const existingRelease = await knex('releases')
        .where({
            entry_id: release.entryId,
            site_id: release.site.id,
        })
        .first();

    const curatedRelease = await curateReleaseEntry(release, batchId, existingRelease);

    if (existingRelease && !argv.redownload) {
        return existingRelease;
    }

    if (existingRelease) {
        const [updatedRelease] = await knex('releases')
            .where('id', existingRelease.id)
            .update({
                ...existingRelease,
                ...curatedRelease,
            })
            .returning('*');

        // associate tags exactly once, whether or not the update returned a row
        await associateTags(release, existingRelease.id);

        if (updatedRelease) {
            logger.info(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);
        }

        // prefer the fresh row so callers see updated values such as the slug
        return updatedRelease || existingRelease;
    }

    const [releaseEntry] = await knex('releases')
        .insert(curatedRelease)
        .returning('*');

    await associateTags(release, releaseEntry.id);

    logger.info(`Stored release "${release.title}" (${releaseEntry.id}, ${release.site.name})`);

    return releaseEntry;
}
|
||||
|
||||
/**
 * Store a batch of scraped releases together with their actors, search
 * documents and media.
 *
 * A new batch row is created first. Releases are then stored with a
 * concurrency of 10; a release that fails or cannot be stored is logged and
 * dropped from the result rather than aborting the batch.
 *
 * @param {Object[]} releases - scraped releases
 * @returns {Promise<{ releases: Object[], actors: Object }>} the stored releases
 *   (scraped data plus persisted `id` and `slug`) and the actors accumulated from them
 */
async function storeReleases(releases) {
    const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

    const storedReleases = await Promise.map(releases, async (release) => {
        try {
            const releaseWithChannelSite = await attachChannelSite(release);
            const releaseWithStudio = await attachStudio(releaseWithChannelSite);
            const storedRelease = await storeRelease(releaseWithStudio, batchId);

            if (!storedRelease) {
                return null;
            }

            // persisted id and slug go last so same-named keys on the scraped
            // release cannot overwrite the database values
            return {
                ...releaseWithChannelSite,
                id: storedRelease.id,
                slug: storedRelease.slug,
            };
        } catch (error) {
            logger.error(error);

            return null;
        }
    }, {
        concurrency: 10,
    }).filter(Boolean);

    logger.info(`Stored ${storedReleases.length} new releases`);

    const actors = accumulateActors(storedReleases);

    await associateActors(actors, storedReleases);

    await Promise.all([
        // actors need to be stored before generating search
        updateReleasesSearch(storedReleases.map(release => release.id)),
        storeReleaseAssets(storedReleases),
    ]);

    if (argv.withProfiles && Object.keys(actors).length > 0) {
        await scrapeBasicActors();
    }

    return {
        releases: storedReleases,
        actors,
    };
}
|
||||
|
||||
module.exports = {
|
||||
fetchReleases,
|
||||
fetchActorReleases,
|
||||
fetchSiteReleases,
|
||||
fetchNetworkReleases,
|
||||
fetchTagReleases,
|
||||
storeRelease,
|
||||
storeReleases,
|
||||
updateReleasesSearch,
|
||||
};
|
||||
@@ -165,7 +165,7 @@ async function scrapeChannelReleases(scraper, channelEntity, preData) {
|
||||
: [],
|
||||
]);
|
||||
|
||||
logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${channelEntity.name}' (${channelEntity.parent.name})`);
|
||||
logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${channelEntity.name}' (${channelEntity.parent?.name})`);
|
||||
|
||||
return [...latestReleases, ...upcomingReleases];
|
||||
}
|
||||
@@ -176,7 +176,7 @@ async function scrapeChannel(channelEntity, accNetworkReleases) {
|
||||
|| scrapers.releases[channelEntity.parent?.parent?.slug];
|
||||
|
||||
if (!scraper) {
|
||||
logger.warn(`No scraper found for '${channelEntity.name}' (${channelEntity.parent.name})`);
|
||||
logger.warn(`No scraper found for '${channelEntity.name}' (${channelEntity.parent?.name})`);
|
||||
return [];
|
||||
}
|
||||
|
||||
@@ -201,7 +201,7 @@ async function scrapeNetworkSequential(networkEntity) {
|
||||
networkEntity.children,
|
||||
async (chain, channelEntity) => {
|
||||
const accNetworkReleases = await chain;
|
||||
const channelReleases = await scrapeChannel(channelEntity, networkEntity, accNetworkReleases);
|
||||
const channelReleases = await scrapeChannel(channelEntity, accNetworkReleases);
|
||||
|
||||
return accNetworkReleases.concat(channelReleases);
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user