Compare commits

4959dfd14fdf84f8c0ef2613649ed8ea654ce475..1907ce1e541dd75fc5a02e1bc23d18ff1b50e353

No commits in common. "4959dfd14fdf84f8c0ef2613649ed8ea654ce475" and "1907ce1e541dd75fc5a02e1bc23d18ff1b50e353" have entirely different histories.

17 changed files with 231 additions and 195 deletions

View File

@@ -45,7 +45,6 @@ async function mounted() {
 'double-penetration',
 'facial',
 'creampie',
-'squirting',
 ],
 appearance: [
 'asian',
@@ -101,7 +100,6 @@ async function mounted() {
 ],
 misc: [
 'gaping',
-'squirting',
 'oil',
 ],
 };

View File

@@ -57,7 +57,7 @@ function initActorActions(store, _router) {
 description
 createdAt
 updatedAt
-network: entity {
+network {
 id
 name
 slug
@@ -80,7 +80,12 @@ function initActorActions(store, _router) {
 profiles: actorsProfiles {
 description
 descriptionHash
-network: entity {
+network {
+id
+slug
+name
+}
+site {
 id
 slug
 name
@@ -157,12 +162,12 @@ function initActorActions(store, _router) {
 ${releaseActorsFragment}
 ${releaseTagsFragment}
 ${releasePosterFragment}
-site: entity {
+site {
 id
 name
 slug
 url
-network: parent {
+network {
 id
 name
 slug
@@ -260,7 +265,7 @@ function initActorActions(store, _router) {
 dateOfBirth
 dateOfDeath
 gender
-network: entity {
+network {
 id
 name
 slug

View File

@@ -1,10 +1,11 @@
 const siteFragment = `
-site: entity {
+site {
 id
 name
 slug
 url
-network: parent {
+independent
+network {
 id
 name
 slug
@@ -19,6 +20,7 @@ const sitesFragment = `
 name
 slug
 url
+independent
 network {
 id
 name
@@ -47,7 +49,7 @@ const actorFields = `
 lazy
 }
 }
-network: entity {
+network {
 id
 name
 slug
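
Aside on the aliasing used above: the left-hand fragments rely on GraphQL field aliasing, where 'network: entity { ... }' queries the generic entity relation but returns it under the network key the client expects. A minimal illustration (field names taken from the fragments above):

// the response contains data.network even though the schema field is named entity
const aliasedFragment = `
network: entity {
id
name
slug
}
`;

The right-hand side drops the aliases because it queries first-class network and site fields directly.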

View File

@@ -326,9 +326,9 @@ exports.up = knex => Promise.resolve()
 table.text('real_name');
-table.integer('entity_id', 12)
+table.integer('network_id', 12)
 .references('id')
-.inTable('entities');
+.inTable('networks');
 table.integer('alias_for', 12)
 .references('id')
@@ -393,11 +393,15 @@ exports.up = knex => Promise.resolve()
 .references('id')
 .inTable('actors');
-table.integer('entity_id', 12)
+table.integer('network_id', 12)
 .references('id')
-.inTable('entities');
-table.unique(['actor_id', 'entity_id']);
+.inTable('networks');
+table.integer('site_id', 12)
+.references('id')
+.inTable('sites');
+table.unique(['actor_id', 'network_id', 'site_id']);
 table.integer('priority', 4)
 .defaultTo(1);
@@ -676,10 +680,13 @@ exports.up = knex => Promise.resolve()
 .then(() => knex.schema.createTable('releases', (table) => {
 table.increments('id', 16);
-table.integer('entity_id', 12)
+table.integer('site_id', 12)
 .references('id')
-.inTable('entities')
-.notNullable();
+.inTable('sites');
+table.integer('network_id', 12)
+.references('id')
+.inTable('networks');
 table.integer('studio_id', 12)
 .references('id')
@@ -690,7 +697,7 @@ exports.up = knex => Promise.resolve()
 table.text('shoot_id');
 table.text('entry_id');
-table.unique(['entity_id', 'entry_id', 'type']);
+table.unique(['site_id', 'network_id', 'entry_id', 'type']);
 table.text('url', 1000);
 table.text('title');
@@ -849,6 +856,15 @@ exports.up = knex => Promise.resolve()
 .then(() => { // eslint-disable-line arrow-body-style
 // allow vim fold
 return knex.raw(`
+ALTER TABLE releases
+ADD CONSTRAINT ensure_site_or_network CHECK (site_id IS NOT NULL OR network_id IS NOT NULL);
+ALTER TABLE releases_search
+ADD COLUMN document tsvector;
+CREATE UNIQUE INDEX unique_actor_slugs_network ON actors (slug, network_id);
+CREATE UNIQUE INDEX unique_actor_slugs ON actors (slug, (network_id IS NULL));
 CREATE TEXT SEARCH DICTIONARY traxxx_dict (
 TEMPLATE = pg_catalog.simple,
 stopwords = traxxx
@@ -858,12 +874,6 @@ exports.up = knex => Promise.resolve()
 COPY = english
 );
-ALTER TABLE releases_search
-ADD COLUMN document tsvector;
-CREATE UNIQUE INDEX unique_actor_slugs_network ON actors (slug, entity_id);
-CREATE UNIQUE INDEX unique_actor_slugs ON actors (slug, (entity_id IS NULL));
 ALTER TEXT SEARCH CONFIGURATION traxxx
 ALTER MAPPING FOR word, numword, hword, numhword, hword_part, hword_numpart, asciiword, asciihword, hword_asciipart WITH traxxx_dict, simple, english_stem;
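
Two details in the raw SQL above are easy to misread. The ensure_site_or_network constraint leaves site_id and network_id individually nullable but requires at least one of them, and unique_actor_slugs indexes the expression (network_id IS NULL) because PostgreSQL treats NULLs as distinct in unique indexes: a plain index on (slug, network_id) would accept any number of network-less actors sharing a slug. A standalone sketch of the behavior (knex calls and sample values are hypothetical; other columns omitted for brevity):

// both rows have network_id NULL, so the expression index sees (slug, true) twice
await knex.raw(`INSERT INTO actors (slug) VALUES ('jane-doe')`); // accepted
await knex.raw(`INSERT INTO actors (slug) VALUES ('jane-doe')`); // rejected by unique_actor_slugs

// a release must reference a site or a network; a row with neither is rejected
await knex.raw(`INSERT INTO releases (entry_id, type) VALUES ('abc', 'scene')`); // violates ensure_site_or_network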

Binary file not shown (before: 1.8 KiB)

Binary file not shown (before: 492 KiB)

Binary file not shown (before: 6.8 KiB)

Binary file not shown (before: 27 KiB)

View File

@@ -635,7 +635,6 @@ const tagPosters = [
 ['piercings', 0, 'Kaegune in "When The Sun Goes Down" for Suicide Girls'],
 ['pussy-eating', 0, 'Kali Roses licking Emily Willis\' pussy in "Peeping On My Neighbor" for Girl Girl'],
 ['redhead', 1, 'Lacy Lennon in "Girl Crush" for When Girls Play'],
-['squirting', 0, 'Veronica Rodriguez in "Hot Latina Squirting" for Jules Jordan'],
 ['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
 ['swallowing', 'poster'],
 ['teen', 0, 'Eva Elfie in "Fresh New Talent" for Club Seventeen'],

View File

@@ -20,6 +20,7 @@ const logger = require('./logger')(__filename);
 const { toBaseReleases } = require('./deep');
 const { associateAvatars } = require('./media');
+const { curateSite } = require('./sites');
 const slugify = require('./utils/slugify');
 const capitalize = require('./utils/capitalize');
@@ -119,7 +120,7 @@ function toBaseActors(actorsOrNames, release) {
 const baseActor = {
 name,
 slug,
-entity: release?.site?.network || release?.entity?.parent || null,
+network: release?.site.network,
 };
 if (actorOrName.name) {
@@ -143,7 +144,7 @@ function curateActor(actor, withDetails = false) {
 name: actor.name,
 slug: actor.slug,
 gender: actor.gender,
-entityId: actor.entity_id,
+networkId: actor.network_id,
 aliasFor: actor.alias_for,
 dateOfBirth: actor.date_of_birth,
 birthCountry: actor.birth_country_alpha2,
@@ -154,10 +155,10 @@ function curateActor(actor, withDetails = false) {
 slug: actor.slug,
 gender: actor.alias.gender,
 },
-entity: actor.entity && {
-id: actor.entity.id,
-name: actor.entity.name,
-slug: actor.entity.slug,
+network: actor.network && {
+id: actor.network.id,
+name: actor.network.name,
+slug: actor.network.slug,
 },
 dateOfDeath: actor.date_of_death,
 cup: actor.cup,
@@ -210,7 +211,7 @@ function curateActorEntry(baseActor, batchId) {
 return {
 name: baseActor.name,
 slug: baseActor.slug,
-entity_id: null,
+network_id: null,
 batch_id: batchId,
 };
 }
@@ -223,7 +224,8 @@ function curateProfileEntry(profile) {
 const curatedProfileEntry = {
 ...(profile.update !== false && { id: profile.update }),
 actor_id: profile.id,
-entity_id: profile.entity?.id || null,
+site_id: profile.site?.id || null,
+network_id: profile.network?.id || null,
 date_of_birth: profile.dateOfBirth,
 date_of_death: profile.dateOfDeath,
 gender: profile.gender,
@@ -266,7 +268,8 @@ async function curateProfile(profile) {
 name: profile.name,
 avatar: profile.avatar,
 scraper: profile.scraper,
-entity: profile.entity,
+site: profile.site,
+network: profile.network,
 update: profile.update,
 };
@@ -340,7 +343,7 @@ async function curateProfile(profile) {
 const { href } = new URL(social);
 return href;
 } catch (error) {
-logger.warn(`Profile scraper for '${profile.entity.name}' returned invalid social link: ${social}`);
+logger.warn(`Profile scraper for '${profile.site.name}' returned invalid social link: ${social}`);
 return null;
 }
 }).filter(Boolean)
@@ -348,9 +351,9 @@ async function curateProfile(profile) {
 curatedProfile.releases = toBaseReleases(profile.releases);
-if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`);
-if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`);
-if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.entity.name}' scraper: ${profile.eyes}`);
+if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.ethnicity}`);
+if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.hairColor || profile.hair}`);
+if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.eyes}`);
 return curatedProfile;
 } catch (error) {
@@ -496,7 +499,7 @@ async function upsertProfiles(profiles) {
 }
 }
-async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId) {
+async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId) {
 const profiles = Promise.map(sources, async (source) => {
 try {
 // config may group sources to try until success
@@ -504,25 +507,24 @@
 try {
 const scraper = scrapers[scraperSlug];
 const context = {
-site: entitiesBySlug[scraperSlug] || null,
-network: entitiesBySlug[scraperSlug] || null,
-entity: entitiesBySlug[scraperSlug] || null,
+site: sitesBySlug[scraperSlug] || null,
+network: networksBySlug[scraperSlug] || sitesBySlug[scraperSlug]?.network || null,
 scraper: scraperSlug,
 };
-const label = context.entity?.name;
+const label = context.site?.name || context.network?.name;
 if (!scraper?.fetchProfile) {
 logger.warn(`No profile profile scraper available for ${scraperSlug}`);
 throw new Error(`No profile profile scraper available for ${scraperSlug}`);
 }
-if (!context.entity) {
-logger.warn(`No entity found for ${scraperSlug}`);
-throw new Error(`No entity found for ${scraperSlug}`);
+if (!context.site && !context.network) {
+logger.warn(`No site or network found for ${scraperSlug}`);
+throw new Error(`No site or network found for ${scraperSlug}`);
 }
-const existingProfile = existingProfilesByActorEntityId[actor.id]?.[context.entity?.id || null];
+const existingProfile = existingProfilesByActorNetworkSiteId[actor.id]?.[context.network?.id || null]?.[context.site?.id || null];
 if (existingProfile && !argv.force) {
 logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);
@@ -572,14 +574,17 @@ async function scrapeActors(actorNames) {
 const baseActors = toBaseActors(actorNames);
 const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
-const entitySlugs = sources.flat();
+const siteSlugs = sources.flat();
-const [entities, existingActorEntries] = await Promise.all([
-knex('entities')
-.select(knex.raw('entities.*, row_to_json(parents) as parent'))
-.whereIn('entities.slug', entitySlugs)
-.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
-.orderBy('entities.type'),
+const [networks, sites, existingActorEntries] = await Promise.all([
+knex('networks').whereIn('slug', siteSlugs),
+knex('sites')
+.select(
+'sites.*',
+'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
+)
+.whereIn('sites.slug', siteSlugs)
+.leftJoin('networks', 'sites.network_id', 'networks.id'),
 knex('actors')
 .select(['id', 'name', 'slug'])
 .modify((queryBuilder) => {
@@ -590,7 +595,8 @@ async function scrapeActors(actorNames) {
 .whereNull('alias_for'),
 ]);
-const entitiesBySlug = entities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});
+const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {});
+const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: curateSite(site) }), {});
 const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
 const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
@@ -602,17 +608,20 @@ async function scrapeActors(actorNames) {
 const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
 const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id));
-const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({
+const existingProfilesByActorNetworkSiteId = existingProfiles.reduce((acc, profile) => ({
 ...acc,
 [profile.actor_id]: {
 ...acc[profile.actor_id],
-[profile.entity_id]: profile,
+[profile.network_id]: {
+...acc[profile.network_id],
+[profile.site_id]: profile,
+},
 },
 }), {});
 const profilesPerActor = await Promise.map(
 actors,
-async actor => scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId),
+async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId),
 { concurrency: 10 },
 );
@@ -636,24 +645,24 @@ async function scrapeActors(actorNames) {
 async function getOrCreateActors(baseActors, batchId) {
 const existingActors = await knex('actors')
-.select('id', 'alias_for', 'name', 'slug', 'entity_id')
+.select('id', 'alias_for', 'name', 'slug', 'network_id')
 .whereIn('slug', baseActors.map(baseActor => baseActor.slug))
-.whereNull('entity_id')
-.orWhereIn(['slug', 'entity_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.entity.id]));
+.whereNull('network_id')
+.orWhereIn(['slug', 'network_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.network.id]));
 // const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
 const existingActorSlugs = existingActors.reduce((acc, actor) => ({
 ...acc,
-[actor.entity_id]: {
-...acc[actor.entity_id],
+[actor.network_id]: {
+...acc[actor.network_id],
 [actor.slug]: true,
 },
 }), {});
-const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
+const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.network.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
 const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
-const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'entity_id']);
+const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'network_id']);
 if (Array.isArray(newActors)) {
 return newActors.concat(existingActors);
@@ -708,7 +717,7 @@ async function fetchActor(actorId) {
 const actor = await knex('actors')
 .select(knex.raw(`
 actors.*,
-row_to_json(entities) as entity,
+row_to_json(networks) as network,
 row_to_json(actor_alias) as alias,
 row_to_json(birth_country) as birth_country,
 row_to_json(residence_country) as residence_country,
@@ -723,7 +732,7 @@ async function fetchActor(actorId) {
 queryBuilder.where('actors.id', actorId);
 })
 .leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
-.leftJoin('entities', 'entities.id', 'actors.entity_id')
+.leftJoin('networks', 'networks.id', 'actors.network_id')
 .leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
 .leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
 .leftJoin('media', 'media.id', 'actors.avatar_media_id')
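
The nested reduce above replaces the flat actor-to-entity profile index with a two-level one, so a profile scraped for a whole network (site_id NULL) and one scraped for a specific site can coexist for the same actor. The resulting lookup shape, as a sketch using the names from this file:

// actor ID → network ID → site ID → profile row; missing levels yield undefined
const existingProfile = existingProfilesByActorNetworkSiteId[actor.id]
?.[context.network?.id || null]
?.[context.site?.id || null];

A null ID used as an object key is coerced to the string 'null'; the reduce that builds the index and this lookup coerce the same way, so the buckets line up.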

View File

@@ -7,6 +7,8 @@ const include = require('./utils/argv-include')(argv);
 const logger = require('./logger')(__filename);
 const knex = require('./knex');
 const scrapers = require('./scrapers/scrapers');
+const { curateSites } = require('./sites');
+const { curateNetworks } = require('./networks');
 function urlToSiteSlug(url) {
 try {
@@ -17,31 +19,37 @@ function urlToSiteSlug(url) {
 return slug;
 } catch (error) {
-logger.warn(`Failed to derive entity slug from '${url}': ${error.message}`);
+logger.warn(`Failed to derive site slug from '${url}': ${error.message}`);
 return null;
 }
 }
-async function findEntities(baseReleases) {
-const baseReleasesWithoutEntity = baseReleases.filter(release => release.url && !release.site && !release.entity);
+async function findSites(baseReleases) {
+const baseReleasesWithoutSite = baseReleases.filter(release => release.url && !release.site);
-const entitySlugs = Array.from(new Set(
-baseReleasesWithoutEntity
+const siteSlugs = Array.from(new Set(
+baseReleasesWithoutSite
 .map(baseRelease => urlToSiteSlug(baseRelease.url))
 .filter(Boolean),
 ));
-const entities = await knex('entities')
-.select(knex.raw('entities.*, row_to_json(parents) as parent'))
-.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
-.whereIn('entities.slug', entitySlugs)
-.orderBy('entities.type', 'asc');
-// channel entity will overwrite network entity
-const entitiesBySlug = entities.reduce((accEntities, entity) => ({ ...accEntities, [entity.slug]: entity }), {});
-return entitiesBySlug;
+const siteEntries = await knex('sites')
+.leftJoin('networks', 'networks.id', 'sites.network_id')
+.select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description')
+.whereIn('sites.slug', siteSlugs);
+const networkEntries = await knex('networks').whereIn('slug', siteSlugs);
+const sites = await curateSites(siteEntries, true, false);
+const networks = await curateNetworks(networkEntries, true, false, false);
+const markedNetworks = networks.map(network => ({ ...network, isNetwork: true }));
+const sitesBySlug = []
+.concat(markedNetworks, sites)
+.reduce((accSites, site) => ({ ...accSites, [site.slug]: site }), {});
+return sitesBySlug;
 }
 function toBaseReleases(baseReleasesOrUrls) {
@@ -81,22 +89,23 @@ function toBaseReleases(baseReleasesOrUrls) {
 .filter(Boolean);
 }
-async function scrapeRelease(baseRelease, entities, type = 'scene') {
-const entity = baseRelease.entity || baseRelease.site || entities[urlToSiteSlug(baseRelease.url)];
+async function scrapeRelease(baseRelease, sites, type = 'scene') {
+const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)];
-if (!entity) {
-logger.warn(`No entity available for ${baseRelease.url}`);
+if (!site) {
+logger.warn(`No site available for ${baseRelease.url}`);
 return baseRelease;
 }
 if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
 return {
 ...baseRelease,
-entity,
+site,
 };
 }
-const scraper = scrapers.releases[entity.slug] || scrapers.releases[entity.parent?.slug];
+const siteWithFallbackNetwork = site.isNetwork ? { ...site, network: site } : site; // make site.network available, even when site is network fallback
+const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
 if (!scraper) {
 logger.warn(`Could not find scraper for ${baseRelease.url}`);
@@ -104,7 +113,7 @@ async function scrapeRelease(baseRelease, entities, type = 'scene') {
 }
 if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
-logger.warn(`The '${entity.name}'-scraper cannot fetch individual ${type}s`);
+logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
 return baseRelease;
 }
@@ -112,14 +121,14 @@ async function scrapeRelease(baseRelease, entities, type = 'scene') {
 logger.verbose(`Fetching ${type} ${baseRelease.url}`);
 const scrapedRelease = type === 'scene'
-? await scraper.fetchScene(baseRelease.url, entity, baseRelease, null, include)
-: await scraper.fetchMovie(baseRelease.url, entity, baseRelease, null, include);
+? await scraper.fetchScene(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include)
+: await scraper.fetchMovie(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include);
 const mergedRelease = {
 ...baseRelease,
 ...scrapedRelease,
 deep: !!scrapedRelease,
-entity,
+site,
 };
 if (!mergedRelease.entryId) {
@@ -143,19 +152,19 @@ async function scrapeRelease(baseRelease, entities, type = 'scene') {
 }
 }
-async function scrapeReleases(baseReleases, entities, type) {
+async function scrapeReleases(baseReleases, sites, type) {
 return Promise.map(
 baseReleases,
-async baseRelease => scrapeRelease(baseRelease, entities, type),
+async baseRelease => scrapeRelease(baseRelease, sites, type),
 { concurrency: 10 },
 );
 }
 async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
 const baseReleases = toBaseReleases(baseReleasesOrUrls);
-const entities = await findEntities(baseReleases);
-const deepReleases = await scrapeReleases(baseReleases, entities, type);
+const sites = await findSites(baseReleases);
+const deepReleases = await scrapeReleases(baseReleases, sites, type);
 return deepReleases.filter(Boolean);
 }
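
When a URL only resolves to a network, findSites above returns the network object itself, flagged isNetwork, in place of a site. scrapeRelease then fills in a self-reference so scrapers that read site.network keep working. The fallback in isolation:

// a network standing in for a site becomes its own network,
// so site.network.slug resolves either way
function withFallbackNetwork(site) {
return site.isNetwork ? { ...site, network: site } : site;
}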

View File

@@ -17,10 +17,7 @@ function curateEntity(entity, includeParameters = false) {
 type: entity.type,
 parameters: includeParameters ? entity.parameters : null,
 parent: entity.parent,
-children: (entity.children || []).map(child => curateEntity({
-...child,
-parent: entity,
-})),
+children: (entity.children || []).map(child => curateEntity(child)),
 };
 return curatedEntity;
@@ -31,8 +28,7 @@ async function curateEntities(entities, includeParameters) {
 }
 async function fetchSitesFromArgv() {
-const rawNetworks = await knex.raw(`
-/* networks from argument with sites as children */
+const rawEntities = await knex.raw(`
 WITH RECURSIVE temp AS (
 SELECT
 id, parent_id, name, slug, type, url, description, parameters
@@ -61,10 +57,8 @@ async function fetchSitesFromArgv() {
 GROUP BY
 temp.parent_id, entities.id, entities.name, parents.id
 UNION ALL
-/* sites from argument as the child of network with parent */
 SELECT
-entities.*, row_to_json(parents) as parent, json_agg(row_to_json(children))
+entities.*, row_to_json(parents) as parent, json_build_array(row_to_json(children))
 FROM
 entities AS children
 LEFT JOIN
@@ -74,13 +68,15 @@ async function fetchSitesFromArgv() {
 WHERE
 children.slug = ANY(?) AND children.type = 2
 GROUP BY
-entities.id, parents.id;
+entities.id, parents.id, children.id;
 `, [argv.networks || [], argv.sites || []]);
-const curatedNetworks = await curateEntities(rawNetworks.rows, true);
-logger.info(`Found ${curatedNetworks.length} networks in database`);
-return curatedNetworks;
+const curatedEntities = await curateEntities(rawEntities.rows, true);
+logger.info(`Found ${curatedEntities.length} entities in database`);
+console.log(rawEntities.rows);
+return curatedEntities;
 }
 async function fetchSitesFromConfig() {
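
The json_agg to json_build_array switch in the second branch of the UNION changes the row shape: json_agg with GROUP BY entities.id collapses all matching children into one array per parent, while json_build_array with children.id added to the GROUP BY emits one row per child, each carrying a single-element array. A standalone comparison, stripped of the recursive CTE (assumed simplification of the query above):

// one row per parent, all children aggregated into a single JSON array
await knex.raw(`SELECT parents.id, json_agg(row_to_json(children)) FROM entities AS children JOIN entities AS parents ON parents.id = children.parent_id GROUP BY parents.id`);

// one row per child, each wrapped in a one-element JSON array
await knex.raw(`SELECT parents.id, json_build_array(row_to_json(children)) FROM entities AS children JOIN entities AS parents ON parents.id = children.parent_id GROUP BY parents.id, children.id`);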

View File

@@ -3,21 +3,21 @@
 const util = require('util');
 const knex = require('../knex');
-const { get, geta, ed, formatDate, ctxa } = require('../utils/q');
+const { get, geta, ed, fd, ctxa } = require('../utils/q');
 const slugify = require('../utils/slugify');
 const { feetInchesToCm } = require('../utils/convert');
 async function getChannelRegExp(site) {
-if (!['hushpass', 'interracialpass'].includes(site.parent.slug)) return null;
+if (!['hushpass', 'interracialpass'].includes(site.network.slug)) return null;
-const sites = await knex('sites').where('network_id', site.parent.id);
+const sites = await knex('sites').where('network_id', site.network.id);
 return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
 }
 function deriveEntryId(release) {
 if (release.date && release.title) {
-return `${slugify(formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
+return `${slugify(fd(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
 }
 return null;
@@ -140,7 +140,7 @@ function scrapeScene({ html, qu }, site, url, baseRelease) {
 release.title = qu.q('.centerwrap h2', true);
 release.description = qu.q('.videocontent p', true);
-release.date = qu.date('.videodetails .date', ['MM/DD/YYYY', 'YYYY-MM-DD']);
+release.date = qu.date('.videodetails .date', 'MM/DD/YYYY');
 release.duration = qu.dur('.videodetails .date');
 release.actors = qu.all('.modelname a', true);
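
The left-hand qu.date call passes an array of candidate formats where the right-hand one passes a single string. Assuming the q helper parses moment-style (an assumption; the helper itself is not part of this diff), an array means each format is tried in order until one validates:

// hypothetical standalone equivalent of multi-format parsing
const moment = require('moment');

function parseDate(text, formats) {
const parsed = moment(text, [].concat(formats), true); // strict: try each format in turn
return parsed.isValid() ? parsed.toDate() : null;
}

// parseDate('05/12/2019', ['MM/DD/YYYY', 'YYYY-MM-DD']) and
// parseDate('2019-05-12', ['MM/DD/YYYY', 'YYYY-MM-DD']) both yield May 12 2019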

View File

@@ -8,7 +8,7 @@ const knex = require('./knex');
 const slugify = require('./utils/slugify');
 const { associateActors, scrapeActors } = require('./actors');
 const { associateReleaseTags } = require('./tags');
-const { curateEntity } = require('./entities');
+const { curateSite } = require('./sites');
 const { associateReleaseMedia } = require('./media');
@@ -20,9 +20,10 @@ function curateReleaseEntry(release, batchId, existingRelease) {
 const curatedRelease = {
 title: release.title,
 entry_id: release.entryId || null,
-entity_id: release.entity.id,
-studio_id: release.studio?.id || null,
+site_id: release.site?.id,
+network_id: release.site ? null : release.network?.id, // prefer site ID if available
 shoot_id: release.shootId || null,
+studio_id: release.studio?.id || null,
 url: release.url,
 date: Number(release.date) ? release.date : null,
 slug,
@@ -45,47 +46,52 @@ function curateReleaseEntry(release, batchId, existingRelease) {
 return curatedRelease;
 }
-async function attachChannelEntities(releases) {
-const releasesWithoutEntity = releases.filter(release => release.channel && !release.entity && release.entity.type !== 1);
+async function attachChannelSites(releases) {
+const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isNetwork || release.site.slug !== release.channel));
-const channelEntities = await knex('entities')
-.select(knex.raw('entities.*, row_to_json(parents) as parent'))
-.whereIn('entities.slug', releasesWithoutEntity.map(release => release.channel))
-.where('entities.type', 2)
-.leftJoin('entities AS parents', 'parents.id', 'entities.parent_id');
+const channelSites = await knex('sites')
+.leftJoin('networks', 'networks.id', 'sites.network_id')
+.select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description')
+.whereIn('sites.slug', releasesWithoutSite.map(release => release.channel));
-const channelEntitiesBySlug = channelEntities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});
+const channelSitesBySlug = channelSites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
-const releasesWithChannelEntity = await Promise.all(releases
+const releasesWithChannelSite = await Promise.all(releases
 .map(async (release) => {
-if (release.channel && channelEntitiesBySlug[release.channel]) {
-const curatedEntity = await curateEntity(channelEntitiesBySlug[release.channel]);
+if (release.channel && channelSitesBySlug[release.channel]) {
+const curatedSite = await curateSite(channelSitesBySlug[release.channel]);
 return {
 ...release,
-entity: curatedEntity,
+site: curatedSite,
 };
 }
-if (release.entity) {
+if (release.site && !release.site.isNetwork) {
 return release;
 }
+if (release.site && release.site.isNetwork) {
+return {
+...release,
+site: null,
+network: release.site,
+};
+}
 logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);
 return null;
 }));
-return releasesWithChannelEntity.filter(Boolean);
+return releasesWithChannelSite.filter(Boolean);
 }
 async function attachStudios(releases) {
 const studioSlugs = releases.map(release => release.studio).filter(Boolean);
-const studios = await knex('entities')
-.whereIn('slug', studioSlugs)
-.where('type', 3);
+const studios = await knex('studios').whereIn('slug', studioSlugs);
 const studioBySlug = studios.reduce((acc, studio) => ({ ...acc, [studio.slug]: studio }), {});
 const releasesWithStudio = releases.map((release) => {
@@ -107,38 +113,38 @@ async function attachStudios(releases) {
 }
 function attachReleaseIds(releases, storedReleases) {
-const storedReleaseIdsByEntityIdAndEntryId = storedReleases.reduce((acc, release) => {
-if (!acc[release.entity_id]) acc[release.entity_id] = {};
-acc[release.entity_id][release.entry_id] = release.id;
+const storedReleaseIdsBySiteIdAndEntryId = storedReleases.reduce((acc, release) => {
+if (!acc[release.site_id]) acc[release.site_id] = {};
+acc[release.site_id][release.entry_id] = release.id;
 return acc;
 }, {});
 const releasesWithId = releases.map(release => ({
 ...release,
-id: storedReleaseIdsByEntityIdAndEntryId[release.entity.id][release.entryId],
+id: storedReleaseIdsBySiteIdAndEntryId[release.site.id][release.entryId],
 }));
 return releasesWithId;
 }
 function filterInternalDuplicateReleases(releases) {
-const releasesByEntityIdAndEntryId = releases.reduce((acc, release) => {
-if (!release.entity) {
+const releasesBySiteIdAndEntryId = releases.reduce((acc, release) => {
+if (!release.site) {
 return acc;
 }
-if (!acc[release.entity.id]) {
-acc[release.entity.id] = {};
+if (!acc[release.site.id]) {
+acc[release.site.id] = {};
 }
-acc[release.entity.id][release.entryId] = release;
+acc[release.site.id][release.entryId] = release;
 return acc;
 }, {});
-return Object.values(releasesByEntityIdAndEntryId)
-.map(entityReleases => Object.values(entityReleases))
+return Object.values(releasesBySiteIdAndEntryId)
+.map(siteReleases => Object.values(siteReleases))
 .flat();
 }
@@ -146,17 +152,17 @@ async function filterDuplicateReleases(releases) {
 const internalUniqueReleases = filterInternalDuplicateReleases(releases);
 const duplicateReleaseEntries = await knex('releases')
-.whereIn(['entry_id', 'entity_id'], internalUniqueReleases.map(release => [release.entryId, release.entity.id]));
+.whereIn(['entry_id', 'site_id'], internalUniqueReleases.map(release => [release.entryId, release.site.id]));
-const duplicateReleasesByEntityIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
-if (!acc[release.entity_id]) acc[release.entity_id] = {};
-acc[release.entity_id][release.entry_id] = true;
+const duplicateReleasesBySiteIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
+if (!acc[release.site_id]) acc[release.site_id] = {};
+acc[release.site_id][release.entry_id] = true;
 return acc;
 }, {});
-const duplicateReleases = internalUniqueReleases.filter(release => duplicateReleasesByEntityIdAndEntryId[release.entity.id]?.[release.entryId]);
-const uniqueReleases = internalUniqueReleases.filter(release => !duplicateReleasesByEntityIdAndEntryId[release.entity.id]?.[release.entryId]);
+const duplicateReleases = internalUniqueReleases.filter(release => duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]);
+const uniqueReleases = internalUniqueReleases.filter(release => !duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]);
 return {
 uniqueReleases,
@@ -174,13 +180,13 @@ async function updateReleasesSearch(releaseIds) {
 TO_TSVECTOR(
 'traxxx',
 COALESCE(releases.title, '') || ' ' ||
-parents.name || ' ' ||
-parents.slug || ' ' ||
-parents.url || ' ' ||
-entities.name || ' ' ||
-entities.slug || ' ' ||
-COALESCE(entities.url, '') || ' ' ||
-COALESCE(entities.alias, '') || ' ' ||
+networks.name || ' ' ||
+networks.slug || ' ' ||
+networks.url || ' ' ||
+sites.name || ' ' ||
+sites.slug || ' ' ||
+COALESCE(sites.url, '') || ' ' ||
+COALESCE(sites.alias, '') || ' ' ||
 COALESCE(releases.shoot_id, '') || ' ' ||
 COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
 STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
@@ -188,15 +194,15 @@ async function updateReleasesSearch(releaseIds) {
 STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
 ) as document
 FROM releases
-LEFT JOIN entities ON releases.entity_id = entities.id
-LEFT JOIN entities AS parents ON parents.id = entities.parent_id
+LEFT JOIN sites ON releases.site_id = sites.id
+LEFT JOIN networks ON sites.network_id = networks.id
 LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
 LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
 LEFT JOIN actors ON local_actors.actor_id = actors.id
 LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 7
 LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
 ${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
-GROUP BY releases.id, entities.name, entities.slug, entities.alias, entities.url, parents.name, parents.slug, parents.url;
+GROUP BY releases.id, sites.name, sites.slug, sites.alias, sites.url, networks.name, networks.slug, networks.url;
 `, releaseIds && [releaseIds]);
 if (documents.rows?.length > 0) {
@@ -212,10 +218,10 @@ async function storeReleases(releases) {
 const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
-const releasesWithChannels = await attachChannelEntities(releases);
-const releasesWithStudios = await attachStudios(releasesWithChannels);
+const releasesWithSites = await attachChannelSites(releases);
+const releasesWithStudios = await attachStudios(releasesWithSites);
-// uniqueness is entity ID + entry ID, filter uniques after adding entities
+// uniqueness is site ID + entry ID, filter uniques after adding sites
 const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);
 const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
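
Duplicate detection in this file hinges on a two-level map keyed by owner ID and entry ID; the rename only swaps entity for site. The shared shape, as a generic sketch of the reduces above:

// rows from the releases table → { [siteId]: { [entryId]: row } }
function indexBySiteAndEntry(rows) {
return rows.reduce((acc, row) => {
if (!acc[row.site_id]) acc[row.site_id] = {};
acc[row.site_id][row.entry_id] = row;
return acc;
}, {});
}

// a scraped release is a duplicate when its pair is already present:
// indexBySiteAndEntry(stored)[release.site.id]?.[release.entryId]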

View File

@@ -27,27 +27,27 @@ async function matchReleaseTags(releases) {
 return tagIdsBySlug;
 }
-async function getEntityTags(releases) {
-const entityIds = releases.map(release => release.entity.id);
-const entityTags = await knex('entities_tags').whereIn('entity_id', entityIds);
+async function getSiteTags(releases) {
+const siteIds = releases.map(release => release.site.id);
+const siteTags = await knex('sites_tags').whereIn('site_id', siteIds);
-const entityTagIdsByEntityId = entityTags.reduce((acc, entityTag) => {
-if (!acc[entityTag.entity_id]) {
-acc[entityTag.entity_id] = [];
+const siteTagIdsBySiteId = siteTags.reduce((acc, siteTag) => {
+if (!acc[siteTag.site_id]) {
+acc[siteTag.site_id] = [];
 }
-acc[entityTag.entity_id].push(entityTag.tag_id);
+acc[siteTag.site_id].push(siteTag.tag_id);
 return acc;
 }, {});
-return entityTagIdsByEntityId;
+return siteTagIdsBySiteId;
 }
-function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId) {
+function buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId) {
 const tagAssociations = releases
 .map((release) => {
-const entityTagIds = entityTagIdsByEntityId[release.entity.id];
+const siteTagIds = siteTagIdsBySiteId[release.site.id];
 const releaseTags = release.tags || [];
 const releaseTagIds = releaseTags.every(tag => typeof tag === 'number')
@@ -57,7 +57,7 @@ function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntit
 const tags = [...new Set(
 // filter duplicates and empties
 releaseTagIds
-.concat(entityTagIds)
+.concat(siteTagIds)
 .filter(Boolean),
 )]
 .map(tagId => ({
@@ -94,9 +94,9 @@ async function filterUniqueAssociations(tagAssociations) {
 async function associateReleaseTags(releases) {
 const tagIdsBySlug = await matchReleaseTags(releases);
-const EntityTagIdsByEntityId = await getEntityTags(releases);
-const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, EntityTagIdsByEntityId);
+const siteTagIdsBySiteId = await getSiteTags(releases);
+const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId);
 const uniqueAssociations = await filterUniqueAssociations(tagAssociations);
 await knex('releases_tags').insert(uniqueAssociations);
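
The association builder merges a release's own tag IDs with the default tags attached to its site, deduplicating through a Set and dropping empty matches with filter(Boolean). The merge step in isolation (sample IDs made up):

const releaseTagIds = [3, 7, null]; // null: a tag slug that matched nothing
const siteTagIds = [7, 12]; // defaults configured for the site
const tags = [...new Set(releaseTagIds.concat(siteTagIds).filter(Boolean))]; // [3, 7, 12]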

View File

@@ -30,14 +30,14 @@ async function filterUniqueReleases(latestReleases, accReleases) {
 .map(release => [release.site.id, release.entryId]);
 const duplicateReleases = await knex('releases')
-.whereIn(['entity_id', 'entry_id'], latestReleaseIdentifiers);
+.whereIn(['site_id', 'entry_id'], latestReleaseIdentifiers);
 // add entry IDs of accumulated releases to prevent an infinite scrape loop
 // when one page contains the same release as the previous
 const duplicateReleasesSiteIdAndEntryIds = duplicateReleases
 .concat(accReleases)
 .reduce((acc, release) => {
-const siteId = release.entity_id || release.site.id;
+const siteId = release.site_id || release.site.id;
 const entryId = release.entry_id || release.entryId;
 if (!acc[siteId]) acc[siteId] = {};
@@ -85,7 +85,7 @@ async function scrapeReleases(scraper, site, preData, upcoming = false) {
 if (!Array.isArray(latestReleases)) {
 // scraper is unable to fetch the releases and returned a HTTP code or null
-logger.warn(`Scraper returned ${latestReleases} when fetching latest from '${site.name}' (${site.parent?.name})`);
+logger.warn(`Scraper returned ${latestReleases} when fetching latest from '${site.name}' (${site.network.name})`);
 return accReleases;
 }
@@ -102,7 +102,7 @@ async function scrapeReleases(scraper, site, preData, upcoming = false) {
 const pageAccReleases = accReleases.concat(uniqueReleases);
-logger.verbose(`Scraped '${site.name}' (${site.parent?.name}) ${upcoming ? 'upcoming' : 'latest'} page ${page}, found ${uniqueReleases.length} unique updates`);
+logger.verbose(`Scraped '${site.name}' (${site.network.name}) ${upcoming ? 'upcoming' : 'latest'} page ${page}, found ${uniqueReleases.length} unique updates`);
 if (needNextPage(uniqueReleases, pageAccReleases)) {
 return scrapePage(page + 1, pageAccReleases);
@@ -135,7 +135,7 @@ async function scrapeLatestReleases(scraper, site, preData) {
 try {
 return await scrapeReleases(scraper, site, preData, false);
 } catch (error) {
-logger.warn(`Failed to scrape latest updates for '${site.slug}' (${site.parent?.slug}): ${error.message}`);
+logger.warn(`Failed to scrape latest updates for '${site.slug}' (${site.network.slug}): ${error.message}`);
 }
 return [];
@@ -149,7 +149,7 @@ async function scrapeUpcomingReleases(scraper, site, preData) {
 try {
 return await scrapeReleases(scraper, site, preData, true);
 } catch (error) {
-logger.warn(`Failed to scrape upcoming updates for '${site.slug}' (${site.parent?.slug}): ${error.message}`);
+logger.warn(`Failed to scrape upcoming updates for '${site.slug}' (${site.network.slug}): ${error.message}`);
 }
 return [];
@@ -165,18 +165,18 @@ async function scrapeSiteReleases(scraper, site, preData) {
 : [],
 ]);
-logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${site.name}' (${site.parent.name})`);
+logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${site.name}' (${site.network.name})`);
 return [...latestReleases, ...upcomingReleases];
 }
 async function scrapeSite(site, accSiteReleases) {
 const scraper = scrapers.releases[site.slug]
-|| scrapers.releases[site.parent?.slug]
-|| scrapers.releases[site.parent?.parent?.slug];
+|| scrapers.releases[site.network.slug]
+|| scrapers.releases[site.network.parent?.slug];
 if (!scraper) {
-logger.warn(`No scraper found for '${site.name}' (${site.parent.name})`);
+logger.warn(`No scraper found for '${site.name}' (${site.network.name})`);
 return [];
 }
@@ -196,12 +196,12 @@ async function scrapeSite(site, accSiteReleases) {
 }
 }
-async function scrapeNetworkSequential(networkEntity) {
+async function scrapeNetworkSequential(network) {
 return Promise.reduce(
-networkEntity.children,
+network.sites,
-async (chain, siteEntity) => {
+async (chain, site) => {
 const accSiteReleases = await chain;
-const siteReleases = await scrapeSite(siteEntity, networkEntity, accSiteReleases);
+const siteReleases = await scrapeSite(site, network, accSiteReleases);
 return accSiteReleases.concat(siteReleases);
 },
@@ -209,10 +209,10 @@ async function scrapeNetworkSequential(networkEntity) {
 );
 }
-async function scrapeNetworkParallel(networkEntity) {
+async function scrapeNetworkParallel(network) {
 return Promise.map(
-networkEntity.children,
+network.sites,
-async siteEntity => scrapeSite(siteEntity, networkEntity),
+async site => scrapeSite(site, network),
 { concurrency: 3 },
 );
 }
@@ -222,6 +222,8 @@ async function fetchUpdates() {
 ? await fetchSitesFromArgv()
 : await fetchSitesFromConfig();
+// console.log('included', includedNetworks);
 const scrapedNetworks = await Promise.map(
 includedNetworks,
 async network => (network.parameters?.sequential
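
Both scraping strategies come from bluebird: Promise.reduce threads an accumulator through the sites so each scrape starts only after the previous one resolves (used when a network sets parameters.sequential), while Promise.map with a concurrency option runs up to three sites at once. A minimal sketch of the sequential variant, reusing scrapeSite from this file:

const Promise = require('bluebird');

async function scrapeSequential(sites) {
// bluebird resolves the accumulator before each step, so acc is a plain array
return Promise.reduce(sites, async (acc, site) => acc.concat(await scrapeSite(site)), []);
}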

View File

@@ -45,7 +45,7 @@ function slugify(string, delimiter = '-', {
 return string;
 }
-const slugComponents = string.trim().toLowerCase().match(/[A-Za-zÀ-ÖØ-öø-ÿ0-9]+/g);
+const slugComponents = string.trim().toLowerCase().match(/[A-Za-zÀ-ÖØ-öø-ÿ]+/g);
 if (!slugComponents) {
 return '';
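
The only change here is dropping 0-9 from the character class, so numeric tokens no longer survive slugification on the right-hand side. The effect in isolation:

const withDigits = /[A-Za-zÀ-ÖØ-öø-ÿ0-9]+/g; // left-hand side
const lettersOnly = /[A-Za-zÀ-ÖØ-öø-ÿ]+/g; // right-hand side

'Area 51'.toLowerCase().match(withDigits); // ['area', '51'] → slug 'area-51'
'Area 51'.toLowerCase().match(lettersOnly); // ['area'] → slug 'area'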