Compare commits
2 Commits
1907ce1e54
...
4959dfd14f
Author | SHA1 | Date |
---|---|---|
|
4959dfd14f | |
|
f0a89df6ab |
|
@ -45,6 +45,7 @@ async function mounted() {
|
||||||
'double-penetration',
|
'double-penetration',
|
||||||
'facial',
|
'facial',
|
||||||
'creampie',
|
'creampie',
|
||||||
|
'squirting',
|
||||||
],
|
],
|
||||||
appearance: [
|
appearance: [
|
||||||
'asian',
|
'asian',
|
||||||
|
@ -100,6 +101,7 @@ async function mounted() {
|
||||||
],
|
],
|
||||||
misc: [
|
misc: [
|
||||||
'gaping',
|
'gaping',
|
||||||
|
'squirting',
|
||||||
'oil',
|
'oil',
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
|
|
|
@ -57,7 +57,7 @@ function initActorActions(store, _router) {
|
||||||
description
|
description
|
||||||
createdAt
|
createdAt
|
||||||
updatedAt
|
updatedAt
|
||||||
network {
|
network: entity {
|
||||||
id
|
id
|
||||||
name
|
name
|
||||||
slug
|
slug
|
||||||
|
@ -80,12 +80,7 @@ function initActorActions(store, _router) {
|
||||||
profiles: actorsProfiles {
|
profiles: actorsProfiles {
|
||||||
description
|
description
|
||||||
descriptionHash
|
descriptionHash
|
||||||
network {
|
network: entity {
|
||||||
id
|
|
||||||
slug
|
|
||||||
name
|
|
||||||
}
|
|
||||||
site {
|
|
||||||
id
|
id
|
||||||
slug
|
slug
|
||||||
name
|
name
|
||||||
|
@ -162,12 +157,12 @@ function initActorActions(store, _router) {
|
||||||
${releaseActorsFragment}
|
${releaseActorsFragment}
|
||||||
${releaseTagsFragment}
|
${releaseTagsFragment}
|
||||||
${releasePosterFragment}
|
${releasePosterFragment}
|
||||||
site {
|
site: entity {
|
||||||
id
|
id
|
||||||
name
|
name
|
||||||
slug
|
slug
|
||||||
url
|
url
|
||||||
network {
|
network: parent {
|
||||||
id
|
id
|
||||||
name
|
name
|
||||||
slug
|
slug
|
||||||
|
@ -265,7 +260,7 @@ function initActorActions(store, _router) {
|
||||||
dateOfBirth
|
dateOfBirth
|
||||||
dateOfDeath
|
dateOfDeath
|
||||||
gender
|
gender
|
||||||
network {
|
network: entity {
|
||||||
id
|
id
|
||||||
name
|
name
|
||||||
slug
|
slug
|
||||||
|
|
|
@ -1,11 +1,10 @@
|
||||||
const siteFragment = `
|
const siteFragment = `
|
||||||
site {
|
site: entity {
|
||||||
id
|
id
|
||||||
name
|
name
|
||||||
slug
|
slug
|
||||||
url
|
url
|
||||||
independent
|
network: parent {
|
||||||
network {
|
|
||||||
id
|
id
|
||||||
name
|
name
|
||||||
slug
|
slug
|
||||||
|
@ -20,7 +19,6 @@ const sitesFragment = `
|
||||||
name
|
name
|
||||||
slug
|
slug
|
||||||
url
|
url
|
||||||
independent
|
|
||||||
network {
|
network {
|
||||||
id
|
id
|
||||||
name
|
name
|
||||||
|
@ -49,7 +47,7 @@ const actorFields = `
|
||||||
lazy
|
lazy
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
network {
|
network: entity {
|
||||||
id
|
id
|
||||||
name
|
name
|
||||||
slug
|
slug
|
||||||
|
|
|
@ -326,9 +326,9 @@ exports.up = knex => Promise.resolve()
|
||||||
|
|
||||||
table.text('real_name');
|
table.text('real_name');
|
||||||
|
|
||||||
table.integer('network_id', 12)
|
table.integer('entity_id', 12)
|
||||||
.references('id')
|
.references('id')
|
||||||
.inTable('networks');
|
.inTable('entities');
|
||||||
|
|
||||||
table.integer('alias_for', 12)
|
table.integer('alias_for', 12)
|
||||||
.references('id')
|
.references('id')
|
||||||
|
@ -393,15 +393,11 @@ exports.up = knex => Promise.resolve()
|
||||||
.references('id')
|
.references('id')
|
||||||
.inTable('actors');
|
.inTable('actors');
|
||||||
|
|
||||||
table.integer('network_id', 12)
|
table.integer('entity_id', 12)
|
||||||
.references('id')
|
.references('id')
|
||||||
.inTable('networks');
|
.inTable('entities');
|
||||||
|
|
||||||
table.integer('site_id', 12)
|
table.unique(['actor_id', 'entity_id']);
|
||||||
.references('id')
|
|
||||||
.inTable('sites');
|
|
||||||
|
|
||||||
table.unique(['actor_id', 'network_id', 'site_id']);
|
|
||||||
table.integer('priority', 4)
|
table.integer('priority', 4)
|
||||||
.defaultTo(1);
|
.defaultTo(1);
|
||||||
|
|
||||||
|
@ -680,13 +676,10 @@ exports.up = knex => Promise.resolve()
|
||||||
.then(() => knex.schema.createTable('releases', (table) => {
|
.then(() => knex.schema.createTable('releases', (table) => {
|
||||||
table.increments('id', 16);
|
table.increments('id', 16);
|
||||||
|
|
||||||
table.integer('site_id', 12)
|
table.integer('entity_id', 12)
|
||||||
.references('id')
|
.references('id')
|
||||||
.inTable('sites');
|
.inTable('entities')
|
||||||
|
.notNullable();
|
||||||
table.integer('network_id', 12)
|
|
||||||
.references('id')
|
|
||||||
.inTable('networks');
|
|
||||||
|
|
||||||
table.integer('studio_id', 12)
|
table.integer('studio_id', 12)
|
||||||
.references('id')
|
.references('id')
|
||||||
|
@ -697,7 +690,7 @@ exports.up = knex => Promise.resolve()
|
||||||
|
|
||||||
table.text('shoot_id');
|
table.text('shoot_id');
|
||||||
table.text('entry_id');
|
table.text('entry_id');
|
||||||
table.unique(['site_id', 'network_id', 'entry_id', 'type']);
|
table.unique(['entity_id', 'entry_id', 'type']);
|
||||||
|
|
||||||
table.text('url', 1000);
|
table.text('url', 1000);
|
||||||
table.text('title');
|
table.text('title');
|
||||||
|
@ -856,15 +849,6 @@ exports.up = knex => Promise.resolve()
|
||||||
.then(() => { // eslint-disable-line arrow-body-style
|
.then(() => { // eslint-disable-line arrow-body-style
|
||||||
// allow vim fold
|
// allow vim fold
|
||||||
return knex.raw(`
|
return knex.raw(`
|
||||||
ALTER TABLE releases
|
|
||||||
ADD CONSTRAINT ensure_site_or_network CHECK (site_id IS NOT NULL OR network_id IS NOT NULL);
|
|
||||||
|
|
||||||
ALTER TABLE releases_search
|
|
||||||
ADD COLUMN document tsvector;
|
|
||||||
|
|
||||||
CREATE UNIQUE INDEX unique_actor_slugs_network ON actors (slug, network_id);
|
|
||||||
CREATE UNIQUE INDEX unique_actor_slugs ON actors (slug, (network_id IS NULL));
|
|
||||||
|
|
||||||
CREATE TEXT SEARCH DICTIONARY traxxx_dict (
|
CREATE TEXT SEARCH DICTIONARY traxxx_dict (
|
||||||
TEMPLATE = pg_catalog.simple,
|
TEMPLATE = pg_catalog.simple,
|
||||||
stopwords = traxxx
|
stopwords = traxxx
|
||||||
|
@ -874,6 +858,12 @@ exports.up = knex => Promise.resolve()
|
||||||
COPY = english
|
COPY = english
|
||||||
);
|
);
|
||||||
|
|
||||||
|
ALTER TABLE releases_search
|
||||||
|
ADD COLUMN document tsvector;
|
||||||
|
|
||||||
|
CREATE UNIQUE INDEX unique_actor_slugs_network ON actors (slug, entity_id);
|
||||||
|
CREATE UNIQUE INDEX unique_actor_slugs ON actors (slug, (entity_id IS NULL));
|
||||||
|
|
||||||
ALTER TEXT SEARCH CONFIGURATION traxxx
|
ALTER TEXT SEARCH CONFIGURATION traxxx
|
||||||
ALTER MAPPING FOR word, numword, hword, numhword, hword_part, hword_numpart, asciiword, asciihword, hword_asciipart WITH traxxx_dict, simple, english_stem;
|
ALTER MAPPING FOR word, numword, hword, numhword, hword_part, hword_numpart, asciiword, asciihword, hword_asciipart WITH traxxx_dict, simple, english_stem;
|
||||||
|
|
||||||
|
|
Binary file not shown.
After Width: | Height: | Size: 1.8 KiB |
Binary file not shown.
After Width: | Height: | Size: 492 KiB |
Binary file not shown.
After Width: | Height: | Size: 6.8 KiB |
Binary file not shown.
After Width: | Height: | Size: 27 KiB |
|
@ -635,6 +635,7 @@ const tagPosters = [
|
||||||
['piercings', 0, 'Kaegune in "When The Sun Goes Down" for Suicide Girls'],
|
['piercings', 0, 'Kaegune in "When The Sun Goes Down" for Suicide Girls'],
|
||||||
['pussy-eating', 0, 'Kali Roses licking Emily Willis\' pussy in "Peeping On My Neighbor" for Girl Girl'],
|
['pussy-eating', 0, 'Kali Roses licking Emily Willis\' pussy in "Peeping On My Neighbor" for Girl Girl'],
|
||||||
['redhead', 1, 'Lacy Lennon in "Girl Crush" for When Girls Play'],
|
['redhead', 1, 'Lacy Lennon in "Girl Crush" for When Girls Play'],
|
||||||
|
['squirting', 0, 'Veronica Rodriguez in "Hot Latina Squirting" for Jules Jordan'],
|
||||||
['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
|
['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
|
||||||
['swallowing', 'poster'],
|
['swallowing', 'poster'],
|
||||||
['teen', 0, 'Eva Elfie in "Fresh New Talent" for Club Seventeen'],
|
['teen', 0, 'Eva Elfie in "Fresh New Talent" for Club Seventeen'],
|
||||||
|
|
|
@ -20,7 +20,6 @@ const logger = require('./logger')(__filename);
|
||||||
|
|
||||||
const { toBaseReleases } = require('./deep');
|
const { toBaseReleases } = require('./deep');
|
||||||
const { associateAvatars } = require('./media');
|
const { associateAvatars } = require('./media');
|
||||||
const { curateSite } = require('./sites');
|
|
||||||
|
|
||||||
const slugify = require('./utils/slugify');
|
const slugify = require('./utils/slugify');
|
||||||
const capitalize = require('./utils/capitalize');
|
const capitalize = require('./utils/capitalize');
|
||||||
|
@ -120,7 +119,7 @@ function toBaseActors(actorsOrNames, release) {
|
||||||
const baseActor = {
|
const baseActor = {
|
||||||
name,
|
name,
|
||||||
slug,
|
slug,
|
||||||
network: release?.site.network,
|
entity: release?.site?.network || release?.entity?.parent || null,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (actorOrName.name) {
|
if (actorOrName.name) {
|
||||||
|
@ -144,7 +143,7 @@ function curateActor(actor, withDetails = false) {
|
||||||
name: actor.name,
|
name: actor.name,
|
||||||
slug: actor.slug,
|
slug: actor.slug,
|
||||||
gender: actor.gender,
|
gender: actor.gender,
|
||||||
networkId: actor.network_id,
|
entityId: actor.entity_id,
|
||||||
aliasFor: actor.alias_for,
|
aliasFor: actor.alias_for,
|
||||||
dateOfBirth: actor.date_of_birth,
|
dateOfBirth: actor.date_of_birth,
|
||||||
birthCountry: actor.birth_country_alpha2,
|
birthCountry: actor.birth_country_alpha2,
|
||||||
|
@ -155,10 +154,10 @@ function curateActor(actor, withDetails = false) {
|
||||||
slug: actor.slug,
|
slug: actor.slug,
|
||||||
gender: actor.alias.gender,
|
gender: actor.alias.gender,
|
||||||
},
|
},
|
||||||
network: actor.network && {
|
entity: actor.entity && {
|
||||||
id: actor.network.id,
|
id: actor.entity.id,
|
||||||
name: actor.network.name,
|
name: actor.entity.name,
|
||||||
slug: actor.network.slug,
|
slug: actor.entity.slug,
|
||||||
},
|
},
|
||||||
dateOfDeath: actor.date_of_death,
|
dateOfDeath: actor.date_of_death,
|
||||||
cup: actor.cup,
|
cup: actor.cup,
|
||||||
|
@ -211,7 +210,7 @@ function curateActorEntry(baseActor, batchId) {
|
||||||
return {
|
return {
|
||||||
name: baseActor.name,
|
name: baseActor.name,
|
||||||
slug: baseActor.slug,
|
slug: baseActor.slug,
|
||||||
network_id: null,
|
entity_id: null,
|
||||||
batch_id: batchId,
|
batch_id: batchId,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -224,8 +223,7 @@ function curateProfileEntry(profile) {
|
||||||
const curatedProfileEntry = {
|
const curatedProfileEntry = {
|
||||||
...(profile.update !== false && { id: profile.update }),
|
...(profile.update !== false && { id: profile.update }),
|
||||||
actor_id: profile.id,
|
actor_id: profile.id,
|
||||||
site_id: profile.site?.id || null,
|
entity_id: profile.entity?.id || null,
|
||||||
network_id: profile.network?.id || null,
|
|
||||||
date_of_birth: profile.dateOfBirth,
|
date_of_birth: profile.dateOfBirth,
|
||||||
date_of_death: profile.dateOfDeath,
|
date_of_death: profile.dateOfDeath,
|
||||||
gender: profile.gender,
|
gender: profile.gender,
|
||||||
|
@ -268,8 +266,7 @@ async function curateProfile(profile) {
|
||||||
name: profile.name,
|
name: profile.name,
|
||||||
avatar: profile.avatar,
|
avatar: profile.avatar,
|
||||||
scraper: profile.scraper,
|
scraper: profile.scraper,
|
||||||
site: profile.site,
|
entity: profile.entity,
|
||||||
network: profile.network,
|
|
||||||
update: profile.update,
|
update: profile.update,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -343,7 +340,7 @@ async function curateProfile(profile) {
|
||||||
const { href } = new URL(social);
|
const { href } = new URL(social);
|
||||||
return href;
|
return href;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.warn(`Profile scraper for '${profile.site.name}' returned invalid social link: ${social}`);
|
logger.warn(`Profile scraper for '${profile.entity.name}' returned invalid social link: ${social}`);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}).filter(Boolean)
|
}).filter(Boolean)
|
||||||
|
@ -351,9 +348,9 @@ async function curateProfile(profile) {
|
||||||
|
|
||||||
curatedProfile.releases = toBaseReleases(profile.releases);
|
curatedProfile.releases = toBaseReleases(profile.releases);
|
||||||
|
|
||||||
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.ethnicity}`);
|
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`);
|
||||||
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.hairColor || profile.hair}`);
|
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`);
|
||||||
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.eyes}`);
|
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.entity.name}' scraper: ${profile.eyes}`);
|
||||||
|
|
||||||
return curatedProfile;
|
return curatedProfile;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
@ -499,7 +496,7 @@ async function upsertProfiles(profiles) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId) {
|
async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId) {
|
||||||
const profiles = Promise.map(sources, async (source) => {
|
const profiles = Promise.map(sources, async (source) => {
|
||||||
try {
|
try {
|
||||||
// config may group sources to try until success
|
// config may group sources to try until success
|
||||||
|
@ -507,24 +504,25 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, exist
|
||||||
try {
|
try {
|
||||||
const scraper = scrapers[scraperSlug];
|
const scraper = scrapers[scraperSlug];
|
||||||
const context = {
|
const context = {
|
||||||
site: sitesBySlug[scraperSlug] || null,
|
site: entitiesBySlug[scraperSlug] || null,
|
||||||
network: networksBySlug[scraperSlug] || sitesBySlug[scraperSlug]?.network || null,
|
network: entitiesBySlug[scraperSlug] || null,
|
||||||
|
entity: entitiesBySlug[scraperSlug] || null,
|
||||||
scraper: scraperSlug,
|
scraper: scraperSlug,
|
||||||
};
|
};
|
||||||
|
|
||||||
const label = context.site?.name || context.network?.name;
|
const label = context.entity?.name;
|
||||||
|
|
||||||
if (!scraper?.fetchProfile) {
|
if (!scraper?.fetchProfile) {
|
||||||
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
|
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
|
||||||
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
|
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!context.site && !context.network) {
|
if (!context.entity) {
|
||||||
logger.warn(`No site or network found for ${scraperSlug}`);
|
logger.warn(`No entity found for ${scraperSlug}`);
|
||||||
throw new Error(`No site or network found for ${scraperSlug}`);
|
throw new Error(`No entity found for ${scraperSlug}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const existingProfile = existingProfilesByActorNetworkSiteId[actor.id]?.[context.network?.id || null]?.[context.site?.id || null];
|
const existingProfile = existingProfilesByActorEntityId[actor.id]?.[context.entity?.id || null];
|
||||||
|
|
||||||
if (existingProfile && !argv.force) {
|
if (existingProfile && !argv.force) {
|
||||||
logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);
|
logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);
|
||||||
|
@ -574,17 +572,14 @@ async function scrapeActors(actorNames) {
|
||||||
const baseActors = toBaseActors(actorNames);
|
const baseActors = toBaseActors(actorNames);
|
||||||
|
|
||||||
const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
|
const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
|
||||||
const siteSlugs = sources.flat();
|
const entitySlugs = sources.flat();
|
||||||
|
|
||||||
const [networks, sites, existingActorEntries] = await Promise.all([
|
const [entities, existingActorEntries] = await Promise.all([
|
||||||
knex('networks').whereIn('slug', siteSlugs),
|
knex('entities')
|
||||||
knex('sites')
|
.select(knex.raw('entities.*, row_to_json(parents) as parent'))
|
||||||
.select(
|
.whereIn('entities.slug', entitySlugs)
|
||||||
'sites.*',
|
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
|
||||||
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
|
.orderBy('entities.type'),
|
||||||
)
|
|
||||||
.whereIn('sites.slug', siteSlugs)
|
|
||||||
.leftJoin('networks', 'sites.network_id', 'networks.id'),
|
|
||||||
knex('actors')
|
knex('actors')
|
||||||
.select(['id', 'name', 'slug'])
|
.select(['id', 'name', 'slug'])
|
||||||
.modify((queryBuilder) => {
|
.modify((queryBuilder) => {
|
||||||
|
@ -595,8 +590,7 @@ async function scrapeActors(actorNames) {
|
||||||
.whereNull('alias_for'),
|
.whereNull('alias_for'),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {});
|
const entitiesBySlug = entities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});
|
||||||
const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: curateSite(site) }), {});
|
|
||||||
|
|
||||||
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
|
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
|
||||||
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
|
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
|
||||||
|
@ -608,20 +602,17 @@ async function scrapeActors(actorNames) {
|
||||||
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
|
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
|
||||||
|
|
||||||
const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id));
|
const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id));
|
||||||
const existingProfilesByActorNetworkSiteId = existingProfiles.reduce((acc, profile) => ({
|
const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({
|
||||||
...acc,
|
...acc,
|
||||||
[profile.actor_id]: {
|
[profile.actor_id]: {
|
||||||
...acc[profile.actor_id],
|
...acc[profile.actor_id],
|
||||||
[profile.network_id]: {
|
[profile.entity_id]: profile,
|
||||||
...acc[profile.network_id],
|
|
||||||
[profile.site_id]: profile,
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
}), {});
|
}), {});
|
||||||
|
|
||||||
const profilesPerActor = await Promise.map(
|
const profilesPerActor = await Promise.map(
|
||||||
actors,
|
actors,
|
||||||
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId),
|
async actor => scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId),
|
||||||
{ concurrency: 10 },
|
{ concurrency: 10 },
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -645,24 +636,24 @@ async function scrapeActors(actorNames) {
|
||||||
|
|
||||||
async function getOrCreateActors(baseActors, batchId) {
|
async function getOrCreateActors(baseActors, batchId) {
|
||||||
const existingActors = await knex('actors')
|
const existingActors = await knex('actors')
|
||||||
.select('id', 'alias_for', 'name', 'slug', 'network_id')
|
.select('id', 'alias_for', 'name', 'slug', 'entity_id')
|
||||||
.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
|
.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
|
||||||
.whereNull('network_id')
|
.whereNull('entity_id')
|
||||||
.orWhereIn(['slug', 'network_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.network.id]));
|
.orWhereIn(['slug', 'entity_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.entity.id]));
|
||||||
|
|
||||||
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
|
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
|
||||||
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
|
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
|
||||||
...acc,
|
...acc,
|
||||||
[actor.network_id]: {
|
[actor.entity_id]: {
|
||||||
...acc[actor.network_id],
|
...acc[actor.entity_id],
|
||||||
[actor.slug]: true,
|
[actor.slug]: true,
|
||||||
},
|
},
|
||||||
}), {});
|
}), {});
|
||||||
|
|
||||||
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.network.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
|
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
|
||||||
|
|
||||||
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
|
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
|
||||||
const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'network_id']);
|
const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'entity_id']);
|
||||||
|
|
||||||
if (Array.isArray(newActors)) {
|
if (Array.isArray(newActors)) {
|
||||||
return newActors.concat(existingActors);
|
return newActors.concat(existingActors);
|
||||||
|
@ -717,7 +708,7 @@ async function fetchActor(actorId) {
|
||||||
const actor = await knex('actors')
|
const actor = await knex('actors')
|
||||||
.select(knex.raw(`
|
.select(knex.raw(`
|
||||||
actors.*,
|
actors.*,
|
||||||
row_to_json(networks) as network,
|
row_to_json(entities) as entity,
|
||||||
row_to_json(actor_alias) as alias,
|
row_to_json(actor_alias) as alias,
|
||||||
row_to_json(birth_country) as birth_country,
|
row_to_json(birth_country) as birth_country,
|
||||||
row_to_json(residence_country) as residence_country,
|
row_to_json(residence_country) as residence_country,
|
||||||
|
@ -732,7 +723,7 @@ async function fetchActor(actorId) {
|
||||||
queryBuilder.where('actors.id', actorId);
|
queryBuilder.where('actors.id', actorId);
|
||||||
})
|
})
|
||||||
.leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
|
.leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
|
||||||
.leftJoin('networks', 'networks.id', 'actors.network_id')
|
.leftJoin('entities', 'entities.id', 'actors.entity_id')
|
||||||
.leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
|
.leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
|
||||||
.leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
|
.leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
|
||||||
.leftJoin('media', 'media.id', 'actors.avatar_media_id')
|
.leftJoin('media', 'media.id', 'actors.avatar_media_id')
|
||||||
|
|
63
src/deep.js
63
src/deep.js
|
@ -7,8 +7,6 @@ const include = require('./utils/argv-include')(argv);
|
||||||
const logger = require('./logger')(__filename);
|
const logger = require('./logger')(__filename);
|
||||||
const knex = require('./knex');
|
const knex = require('./knex');
|
||||||
const scrapers = require('./scrapers/scrapers');
|
const scrapers = require('./scrapers/scrapers');
|
||||||
const { curateSites } = require('./sites');
|
|
||||||
const { curateNetworks } = require('./networks');
|
|
||||||
|
|
||||||
function urlToSiteSlug(url) {
|
function urlToSiteSlug(url) {
|
||||||
try {
|
try {
|
||||||
|
@ -19,37 +17,31 @@ function urlToSiteSlug(url) {
|
||||||
|
|
||||||
return slug;
|
return slug;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.warn(`Failed to derive site slug from '${url}': ${error.message}`);
|
logger.warn(`Failed to derive entity slug from '${url}': ${error.message}`);
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function findSites(baseReleases) {
|
async function findEntities(baseReleases) {
|
||||||
const baseReleasesWithoutSite = baseReleases.filter(release => release.url && !release.site);
|
const baseReleasesWithoutEntity = baseReleases.filter(release => release.url && !release.site && !release.entity);
|
||||||
|
|
||||||
const siteSlugs = Array.from(new Set(
|
const entitySlugs = Array.from(new Set(
|
||||||
baseReleasesWithoutSite
|
baseReleasesWithoutEntity
|
||||||
.map(baseRelease => urlToSiteSlug(baseRelease.url))
|
.map(baseRelease => urlToSiteSlug(baseRelease.url))
|
||||||
.filter(Boolean),
|
.filter(Boolean),
|
||||||
));
|
));
|
||||||
|
|
||||||
const siteEntries = await knex('sites')
|
const entities = await knex('entities')
|
||||||
.leftJoin('networks', 'networks.id', 'sites.network_id')
|
.select(knex.raw('entities.*, row_to_json(parents) as parent'))
|
||||||
.select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description')
|
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
|
||||||
.whereIn('sites.slug', siteSlugs);
|
.whereIn('entities.slug', entitySlugs)
|
||||||
|
.orderBy('entities.type', 'asc');
|
||||||
|
|
||||||
const networkEntries = await knex('networks').whereIn('slug', siteSlugs);
|
// channel entity will overwrite network entity
|
||||||
|
const entitiesBySlug = entities.reduce((accEntities, entity) => ({ ...accEntities, [entity.slug]: entity }), {});
|
||||||
|
|
||||||
const sites = await curateSites(siteEntries, true, false);
|
return entitiesBySlug;
|
||||||
const networks = await curateNetworks(networkEntries, true, false, false);
|
|
||||||
const markedNetworks = networks.map(network => ({ ...network, isNetwork: true }));
|
|
||||||
|
|
||||||
const sitesBySlug = []
|
|
||||||
.concat(markedNetworks, sites)
|
|
||||||
.reduce((accSites, site) => ({ ...accSites, [site.slug]: site }), {});
|
|
||||||
|
|
||||||
return sitesBySlug;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function toBaseReleases(baseReleasesOrUrls) {
|
function toBaseReleases(baseReleasesOrUrls) {
|
||||||
|
@ -89,23 +81,22 @@ function toBaseReleases(baseReleasesOrUrls) {
|
||||||
.filter(Boolean);
|
.filter(Boolean);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeRelease(baseRelease, sites, type = 'scene') {
|
async function scrapeRelease(baseRelease, entities, type = 'scene') {
|
||||||
const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)];
|
const entity = baseRelease.entity || baseRelease.site || entities[urlToSiteSlug(baseRelease.url)];
|
||||||
|
|
||||||
if (!site) {
|
if (!entity) {
|
||||||
logger.warn(`No site available for ${baseRelease.url}`);
|
logger.warn(`No entity available for ${baseRelease.url}`);
|
||||||
return baseRelease;
|
return baseRelease;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
|
if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
|
||||||
return {
|
return {
|
||||||
...baseRelease,
|
...baseRelease,
|
||||||
site,
|
entity,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const siteWithFallbackNetwork = site.isNetwork ? { ...site, network: site } : site; // make site.network available, even when site is network fallback
|
const scraper = scrapers.releases[entity.slug] || scrapers.releases[entity.parent?.slug];
|
||||||
const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
|
|
||||||
|
|
||||||
if (!scraper) {
|
if (!scraper) {
|
||||||
logger.warn(`Could not find scraper for ${baseRelease.url}`);
|
logger.warn(`Could not find scraper for ${baseRelease.url}`);
|
||||||
|
@ -113,7 +104,7 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
|
if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
|
||||||
logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
|
logger.warn(`The '${entity.name}'-scraper cannot fetch individual ${type}s`);
|
||||||
return baseRelease;
|
return baseRelease;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -121,14 +112,14 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
|
||||||
logger.verbose(`Fetching ${type} ${baseRelease.url}`);
|
logger.verbose(`Fetching ${type} ${baseRelease.url}`);
|
||||||
|
|
||||||
const scrapedRelease = type === 'scene'
|
const scrapedRelease = type === 'scene'
|
||||||
? await scraper.fetchScene(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include)
|
? await scraper.fetchScene(baseRelease.url, entity, baseRelease, null, include)
|
||||||
: await scraper.fetchMovie(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include);
|
: await scraper.fetchMovie(baseRelease.url, entity, baseRelease, null, include);
|
||||||
|
|
||||||
const mergedRelease = {
|
const mergedRelease = {
|
||||||
...baseRelease,
|
...baseRelease,
|
||||||
...scrapedRelease,
|
...scrapedRelease,
|
||||||
deep: !!scrapedRelease,
|
deep: !!scrapedRelease,
|
||||||
site,
|
entity,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!mergedRelease.entryId) {
|
if (!mergedRelease.entryId) {
|
||||||
|
@ -152,19 +143,19 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeReleases(baseReleases, sites, type) {
|
async function scrapeReleases(baseReleases, entities, type) {
|
||||||
return Promise.map(
|
return Promise.map(
|
||||||
baseReleases,
|
baseReleases,
|
||||||
async baseRelease => scrapeRelease(baseRelease, sites, type),
|
async baseRelease => scrapeRelease(baseRelease, entities, type),
|
||||||
{ concurrency: 10 },
|
{ concurrency: 10 },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
|
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
|
||||||
const baseReleases = toBaseReleases(baseReleasesOrUrls);
|
const baseReleases = toBaseReleases(baseReleasesOrUrls);
|
||||||
const sites = await findSites(baseReleases);
|
const entities = await findEntities(baseReleases);
|
||||||
|
|
||||||
const deepReleases = await scrapeReleases(baseReleases, sites, type);
|
const deepReleases = await scrapeReleases(baseReleases, entities, type);
|
||||||
|
|
||||||
return deepReleases.filter(Boolean);
|
return deepReleases.filter(Boolean);
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,7 +17,10 @@ function curateEntity(entity, includeParameters = false) {
|
||||||
type: entity.type,
|
type: entity.type,
|
||||||
parameters: includeParameters ? entity.parameters : null,
|
parameters: includeParameters ? entity.parameters : null,
|
||||||
parent: entity.parent,
|
parent: entity.parent,
|
||||||
children: (entity.children || []).map(child => curateEntity(child)),
|
children: (entity.children || []).map(child => curateEntity({
|
||||||
|
...child,
|
||||||
|
parent: entity,
|
||||||
|
})),
|
||||||
};
|
};
|
||||||
|
|
||||||
return curatedEntity;
|
return curatedEntity;
|
||||||
|
@ -28,7 +31,8 @@ async function curateEntities(entities, includeParameters) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchSitesFromArgv() {
|
async function fetchSitesFromArgv() {
|
||||||
const rawEntities = await knex.raw(`
|
const rawNetworks = await knex.raw(`
|
||||||
|
/* networks from argument with sites as children */
|
||||||
WITH RECURSIVE temp AS (
|
WITH RECURSIVE temp AS (
|
||||||
SELECT
|
SELECT
|
||||||
id, parent_id, name, slug, type, url, description, parameters
|
id, parent_id, name, slug, type, url, description, parameters
|
||||||
|
@ -57,8 +61,10 @@ async function fetchSitesFromArgv() {
|
||||||
GROUP BY
|
GROUP BY
|
||||||
temp.parent_id, entities.id, entities.name, parents.id
|
temp.parent_id, entities.id, entities.name, parents.id
|
||||||
UNION ALL
|
UNION ALL
|
||||||
|
|
||||||
|
/* sites from argument as the child of network with parent */
|
||||||
SELECT
|
SELECT
|
||||||
entities.*, row_to_json(parents) as parent, json_build_array(row_to_json(children))
|
entities.*, row_to_json(parents) as parent, json_agg(row_to_json(children))
|
||||||
FROM
|
FROM
|
||||||
entities AS children
|
entities AS children
|
||||||
LEFT JOIN
|
LEFT JOIN
|
||||||
|
@ -68,15 +74,13 @@ async function fetchSitesFromArgv() {
|
||||||
WHERE
|
WHERE
|
||||||
children.slug = ANY(?) AND children.type = 2
|
children.slug = ANY(?) AND children.type = 2
|
||||||
GROUP BY
|
GROUP BY
|
||||||
entities.id, parents.id, children.id;
|
entities.id, parents.id;
|
||||||
`, [argv.networks || [], argv.sites || []]);
|
`, [argv.networks || [], argv.sites || []]);
|
||||||
|
|
||||||
const curatedEntities = await curateEntities(rawEntities.rows, true);
|
const curatedNetworks = await curateEntities(rawNetworks.rows, true);
|
||||||
logger.info(`Found ${curatedEntities.length} entities in database`);
|
logger.info(`Found ${curatedNetworks.length} networks in database`);
|
||||||
|
|
||||||
console.log(rawEntities.rows);
|
return curatedNetworks;
|
||||||
|
|
||||||
return curatedEntities;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchSitesFromConfig() {
|
async function fetchSitesFromConfig() {
|
||||||
|
|
|
@ -3,21 +3,21 @@
|
||||||
const util = require('util');
|
const util = require('util');
|
||||||
|
|
||||||
const knex = require('../knex');
|
const knex = require('../knex');
|
||||||
const { get, geta, ed, fd, ctxa } = require('../utils/q');
|
const { get, geta, ed, formatDate, ctxa } = require('../utils/q');
|
||||||
const slugify = require('../utils/slugify');
|
const slugify = require('../utils/slugify');
|
||||||
const { feetInchesToCm } = require('../utils/convert');
|
const { feetInchesToCm } = require('../utils/convert');
|
||||||
|
|
||||||
async function getChannelRegExp(site) {
|
async function getChannelRegExp(site) {
|
||||||
if (!['hushpass', 'interracialpass'].includes(site.network.slug)) return null;
|
if (!['hushpass', 'interracialpass'].includes(site.parent.slug)) return null;
|
||||||
|
|
||||||
const sites = await knex('sites').where('network_id', site.network.id);
|
const sites = await knex('sites').where('network_id', site.parent.id);
|
||||||
|
|
||||||
return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
|
return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
|
||||||
}
|
}
|
||||||
|
|
||||||
function deriveEntryId(release) {
|
function deriveEntryId(release) {
|
||||||
if (release.date && release.title) {
|
if (release.date && release.title) {
|
||||||
return `${slugify(fd(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
|
return `${slugify(formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
|
@ -140,7 +140,7 @@ function scrapeScene({ html, qu }, site, url, baseRelease) {
|
||||||
release.title = qu.q('.centerwrap h2', true);
|
release.title = qu.q('.centerwrap h2', true);
|
||||||
release.description = qu.q('.videocontent p', true);
|
release.description = qu.q('.videocontent p', true);
|
||||||
|
|
||||||
release.date = qu.date('.videodetails .date', 'MM/DD/YYYY');
|
release.date = qu.date('.videodetails .date', ['MM/DD/YYYY', 'YYYY-MM-DD']);
|
||||||
release.duration = qu.dur('.videodetails .date');
|
release.duration = qu.dur('.videodetails .date');
|
||||||
|
|
||||||
release.actors = qu.all('.modelname a', true);
|
release.actors = qu.all('.modelname a', true);
|
||||||
|
|
|
@ -8,7 +8,7 @@ const knex = require('./knex');
|
||||||
const slugify = require('./utils/slugify');
|
const slugify = require('./utils/slugify');
|
||||||
const { associateActors, scrapeActors } = require('./actors');
|
const { associateActors, scrapeActors } = require('./actors');
|
||||||
const { associateReleaseTags } = require('./tags');
|
const { associateReleaseTags } = require('./tags');
|
||||||
const { curateSite } = require('./sites');
|
const { curateEntity } = require('./entities');
|
||||||
const { associateReleaseMedia } = require('./media');
|
const { associateReleaseMedia } = require('./media');
|
||||||
|
|
||||||
function curateReleaseEntry(release, batchId, existingRelease) {
|
function curateReleaseEntry(release, batchId, existingRelease) {
|
||||||
|
@ -20,10 +20,9 @@ function curateReleaseEntry(release, batchId, existingRelease) {
|
||||||
const curatedRelease = {
|
const curatedRelease = {
|
||||||
title: release.title,
|
title: release.title,
|
||||||
entry_id: release.entryId || null,
|
entry_id: release.entryId || null,
|
||||||
site_id: release.site?.id,
|
entity_id: release.entity.id,
|
||||||
network_id: release.site ? null : release.network?.id, // prefer site ID if available
|
|
||||||
shoot_id: release.shootId || null,
|
|
||||||
studio_id: release.studio?.id || null,
|
studio_id: release.studio?.id || null,
|
||||||
|
shoot_id: release.shootId || null,
|
||||||
url: release.url,
|
url: release.url,
|
||||||
date: Number(release.date) ? release.date : null,
|
date: Number(release.date) ? release.date : null,
|
||||||
slug,
|
slug,
|
||||||
|
@ -46,52 +45,47 @@ function curateReleaseEntry(release, batchId, existingRelease) {
|
||||||
return curatedRelease;
|
return curatedRelease;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function attachChannelSites(releases) {
|
async function attachChannelEntities(releases) {
|
||||||
const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isNetwork || release.site.slug !== release.channel));
|
const releasesWithoutEntity = releases.filter(release => release.channel && !release.entity && release.entity.type !== 1);
|
||||||
|
|
||||||
const channelSites = await knex('sites')
|
const channelEntities = await knex('entities')
|
||||||
.leftJoin('networks', 'networks.id', 'sites.network_id')
|
.select(knex.raw('entities.*, row_to_json(parents) as parent'))
|
||||||
.select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description')
|
.whereIn('entities.slug', releasesWithoutEntity.map(release => release.channel))
|
||||||
.whereIn('sites.slug', releasesWithoutSite.map(release => release.channel));
|
.where('entities.type', 2)
|
||||||
|
.leftJoin('entities AS parents', 'parents.id', 'entities.parent_id');
|
||||||
|
|
||||||
const channelSitesBySlug = channelSites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
|
const channelEntitiesBySlug = channelEntities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});
|
||||||
|
|
||||||
const releasesWithChannelSite = await Promise.all(releases
|
const releasesWithChannelEntity = await Promise.all(releases
|
||||||
.map(async (release) => {
|
.map(async (release) => {
|
||||||
if (release.channel && channelSitesBySlug[release.channel]) {
|
if (release.channel && channelEntitiesBySlug[release.channel]) {
|
||||||
const curatedSite = await curateSite(channelSitesBySlug[release.channel]);
|
const curatedEntity = await curateEntity(channelEntitiesBySlug[release.channel]);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
...release,
|
...release,
|
||||||
site: curatedSite,
|
entity: curatedEntity,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
if (release.site && !release.site.isNetwork) {
|
if (release.entity) {
|
||||||
return release;
|
return release;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (release.site && release.site.isNetwork) {
|
|
||||||
return {
|
|
||||||
...release,
|
|
||||||
site: null,
|
|
||||||
network: release.site,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);
|
logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}));
|
}));
|
||||||
|
|
||||||
return releasesWithChannelSite.filter(Boolean);
|
return releasesWithChannelEntity.filter(Boolean);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function attachStudios(releases) {
|
async function attachStudios(releases) {
|
||||||
const studioSlugs = releases.map(release => release.studio).filter(Boolean);
|
const studioSlugs = releases.map(release => release.studio).filter(Boolean);
|
||||||
|
|
||||||
const studios = await knex('studios').whereIn('slug', studioSlugs);
|
const studios = await knex('entities')
|
||||||
|
.whereIn('slug', studioSlugs)
|
||||||
|
.where('type', 3);
|
||||||
|
|
||||||
const studioBySlug = studios.reduce((acc, studio) => ({ ...acc, [studio.slug]: studio }), {});
|
const studioBySlug = studios.reduce((acc, studio) => ({ ...acc, [studio.slug]: studio }), {});
|
||||||
|
|
||||||
const releasesWithStudio = releases.map((release) => {
|
const releasesWithStudio = releases.map((release) => {
|
||||||
|
@ -113,38 +107,38 @@ async function attachStudios(releases) {
|
||||||
}
|
}
|
||||||
|
|
||||||
function attachReleaseIds(releases, storedReleases) {
|
function attachReleaseIds(releases, storedReleases) {
|
||||||
const storedReleaseIdsBySiteIdAndEntryId = storedReleases.reduce((acc, release) => {
|
const storedReleaseIdsByEntityIdAndEntryId = storedReleases.reduce((acc, release) => {
|
||||||
if (!acc[release.site_id]) acc[release.site_id] = {};
|
if (!acc[release.entity_id]) acc[release.entity_id] = {};
|
||||||
acc[release.site_id][release.entry_id] = release.id;
|
acc[release.entity_id][release.entry_id] = release.id;
|
||||||
|
|
||||||
return acc;
|
return acc;
|
||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
const releasesWithId = releases.map(release => ({
|
const releasesWithId = releases.map(release => ({
|
||||||
...release,
|
...release,
|
||||||
id: storedReleaseIdsBySiteIdAndEntryId[release.site.id][release.entryId],
|
id: storedReleaseIdsByEntityIdAndEntryId[release.entity.id][release.entryId],
|
||||||
}));
|
}));
|
||||||
|
|
||||||
return releasesWithId;
|
return releasesWithId;
|
||||||
}
|
}
|
||||||
|
|
||||||
function filterInternalDuplicateReleases(releases) {
|
function filterInternalDuplicateReleases(releases) {
|
||||||
const releasesBySiteIdAndEntryId = releases.reduce((acc, release) => {
|
const releasesByEntityIdAndEntryId = releases.reduce((acc, release) => {
|
||||||
if (!release.site) {
|
if (!release.entity) {
|
||||||
return acc;
|
return acc;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!acc[release.site.id]) {
|
if (!acc[release.entity.id]) {
|
||||||
acc[release.site.id] = {};
|
acc[release.entity.id] = {};
|
||||||
}
|
}
|
||||||
|
|
||||||
acc[release.site.id][release.entryId] = release;
|
acc[release.entity.id][release.entryId] = release;
|
||||||
|
|
||||||
return acc;
|
return acc;
|
||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
return Object.values(releasesBySiteIdAndEntryId)
|
return Object.values(releasesByEntityIdAndEntryId)
|
||||||
.map(siteReleases => Object.values(siteReleases))
|
.map(entityReleases => Object.values(entityReleases))
|
||||||
.flat();
|
.flat();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -152,17 +146,17 @@ async function filterDuplicateReleases(releases) {
|
||||||
const internalUniqueReleases = filterInternalDuplicateReleases(releases);
|
const internalUniqueReleases = filterInternalDuplicateReleases(releases);
|
||||||
|
|
||||||
const duplicateReleaseEntries = await knex('releases')
|
const duplicateReleaseEntries = await knex('releases')
|
||||||
.whereIn(['entry_id', 'site_id'], internalUniqueReleases.map(release => [release.entryId, release.site.id]));
|
.whereIn(['entry_id', 'entity_id'], internalUniqueReleases.map(release => [release.entryId, release.entity.id]));
|
||||||
|
|
||||||
const duplicateReleasesBySiteIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
|
const duplicateReleasesByEntityIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
|
||||||
if (!acc[release.site_id]) acc[release.site_id] = {};
|
if (!acc[release.entity_id]) acc[release.entity_id] = {};
|
||||||
acc[release.site_id][release.entry_id] = true;
|
acc[release.entity_id][release.entry_id] = true;
|
||||||
|
|
||||||
return acc;
|
return acc;
|
||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
const duplicateReleases = internalUniqueReleases.filter(release => duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]);
|
const duplicateReleases = internalUniqueReleases.filter(release => duplicateReleasesByEntityIdAndEntryId[release.entity.id]?.[release.entryId]);
|
||||||
const uniqueReleases = internalUniqueReleases.filter(release => !duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]);
|
const uniqueReleases = internalUniqueReleases.filter(release => !duplicateReleasesByEntityIdAndEntryId[release.entity.id]?.[release.entryId]);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
uniqueReleases,
|
uniqueReleases,
|
||||||
|
@ -180,13 +174,13 @@ async function updateReleasesSearch(releaseIds) {
|
||||||
TO_TSVECTOR(
|
TO_TSVECTOR(
|
||||||
'traxxx',
|
'traxxx',
|
||||||
COALESCE(releases.title, '') || ' ' ||
|
COALESCE(releases.title, '') || ' ' ||
|
||||||
networks.name || ' ' ||
|
parents.name || ' ' ||
|
||||||
networks.slug || ' ' ||
|
parents.slug || ' ' ||
|
||||||
networks.url || ' ' ||
|
parents.url || ' ' ||
|
||||||
sites.name || ' ' ||
|
entities.name || ' ' ||
|
||||||
sites.slug || ' ' ||
|
entities.slug || ' ' ||
|
||||||
COALESCE(sites.url, '') || ' ' ||
|
COALESCE(entities.url, '') || ' ' ||
|
||||||
COALESCE(sites.alias, '') || ' ' ||
|
COALESCE(entities.alias, '') || ' ' ||
|
||||||
COALESCE(releases.shoot_id, '') || ' ' ||
|
COALESCE(releases.shoot_id, '') || ' ' ||
|
||||||
COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
|
COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
|
||||||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
|
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
|
||||||
|
@ -194,15 +188,15 @@ async function updateReleasesSearch(releaseIds) {
|
||||||
STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
|
STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
|
||||||
) as document
|
) as document
|
||||||
FROM releases
|
FROM releases
|
||||||
LEFT JOIN sites ON releases.site_id = sites.id
|
LEFT JOIN entities ON releases.entity_id = entities.id
|
||||||
LEFT JOIN networks ON sites.network_id = networks.id
|
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
|
||||||
LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
|
LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
|
||||||
LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
|
LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
|
||||||
LEFT JOIN actors ON local_actors.actor_id = actors.id
|
LEFT JOIN actors ON local_actors.actor_id = actors.id
|
||||||
LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 7
|
LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 7
|
||||||
LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
|
LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
|
||||||
${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
|
${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
|
||||||
GROUP BY releases.id, sites.name, sites.slug, sites.alias, sites.url, networks.name, networks.slug, networks.url;
|
GROUP BY releases.id, entities.name, entities.slug, entities.alias, entities.url, parents.name, parents.slug, parents.url;
|
||||||
`, releaseIds && [releaseIds]);
|
`, releaseIds && [releaseIds]);
|
||||||
|
|
||||||
if (documents.rows?.length > 0) {
|
if (documents.rows?.length > 0) {
|
||||||
|
@ -218,10 +212,10 @@ async function storeReleases(releases) {
|
||||||
|
|
||||||
const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
|
const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
|
||||||
|
|
||||||
const releasesWithSites = await attachChannelSites(releases);
|
const releasesWithChannels = await attachChannelEntities(releases);
|
||||||
const releasesWithStudios = await attachStudios(releasesWithSites);
|
const releasesWithStudios = await attachStudios(releasesWithChannels);
|
||||||
|
|
||||||
// uniqueness is site ID + entry ID, filter uniques after adding sites
|
// uniqueness is entity ID + entry ID, filter uniques after adding entities
|
||||||
const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);
|
const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);
|
||||||
|
|
||||||
const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
|
const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
|
||||||
|
|
26
src/tags.js
26
src/tags.js
|
@ -27,27 +27,27 @@ async function matchReleaseTags(releases) {
|
||||||
return tagIdsBySlug;
|
return tagIdsBySlug;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function getSiteTags(releases) {
|
async function getEntityTags(releases) {
|
||||||
const siteIds = releases.map(release => release.site.id);
|
const entityIds = releases.map(release => release.entity.id);
|
||||||
const siteTags = await knex('sites_tags').whereIn('site_id', siteIds);
|
const entityTags = await knex('entities_tags').whereIn('entity_id', entityIds);
|
||||||
|
|
||||||
const siteTagIdsBySiteId = siteTags.reduce((acc, siteTag) => {
|
const entityTagIdsByEntityId = entityTags.reduce((acc, entityTag) => {
|
||||||
if (!acc[siteTag.site_id]) {
|
if (!acc[entityTag.entity_id]) {
|
||||||
acc[siteTag.site_id] = [];
|
acc[entityTag.entity_id] = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
acc[siteTag.site_id].push(siteTag.tag_id);
|
acc[entityTag.entity_id].push(entityTag.tag_id);
|
||||||
|
|
||||||
return acc;
|
return acc;
|
||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
return siteTagIdsBySiteId;
|
return entityTagIdsByEntityId;
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId) {
|
function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId) {
|
||||||
const tagAssociations = releases
|
const tagAssociations = releases
|
||||||
.map((release) => {
|
.map((release) => {
|
||||||
const siteTagIds = siteTagIdsBySiteId[release.site.id];
|
const entityTagIds = entityTagIdsByEntityId[release.entity.id];
|
||||||
const releaseTags = release.tags || [];
|
const releaseTags = release.tags || [];
|
||||||
|
|
||||||
const releaseTagIds = releaseTags.every(tag => typeof tag === 'number')
|
const releaseTagIds = releaseTags.every(tag => typeof tag === 'number')
|
||||||
|
@ -57,7 +57,7 @@ function buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId)
|
||||||
const tags = [...new Set(
|
const tags = [...new Set(
|
||||||
// filter duplicates and empties
|
// filter duplicates and empties
|
||||||
releaseTagIds
|
releaseTagIds
|
||||||
.concat(siteTagIds)
|
.concat(entityTagIds)
|
||||||
.filter(Boolean),
|
.filter(Boolean),
|
||||||
)]
|
)]
|
||||||
.map(tagId => ({
|
.map(tagId => ({
|
||||||
|
@ -94,9 +94,9 @@ async function filterUniqueAssociations(tagAssociations) {
|
||||||
|
|
||||||
async function associateReleaseTags(releases) {
|
async function associateReleaseTags(releases) {
|
||||||
const tagIdsBySlug = await matchReleaseTags(releases);
|
const tagIdsBySlug = await matchReleaseTags(releases);
|
||||||
const siteTagIdsBySiteId = await getSiteTags(releases);
|
const EntityTagIdsByEntityId = await getEntityTags(releases);
|
||||||
|
|
||||||
const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId);
|
const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, EntityTagIdsByEntityId);
|
||||||
const uniqueAssociations = await filterUniqueAssociations(tagAssociations);
|
const uniqueAssociations = await filterUniqueAssociations(tagAssociations);
|
||||||
|
|
||||||
await knex('releases_tags').insert(uniqueAssociations);
|
await knex('releases_tags').insert(uniqueAssociations);
|
||||||
|
|
|
@ -30,14 +30,14 @@ async function filterUniqueReleases(latestReleases, accReleases) {
|
||||||
.map(release => [release.site.id, release.entryId]);
|
.map(release => [release.site.id, release.entryId]);
|
||||||
|
|
||||||
const duplicateReleases = await knex('releases')
|
const duplicateReleases = await knex('releases')
|
||||||
.whereIn(['site_id', 'entry_id'], latestReleaseIdentifiers);
|
.whereIn(['entity_id', 'entry_id'], latestReleaseIdentifiers);
|
||||||
|
|
||||||
// add entry IDs of accumulated releases to prevent an infinite scrape loop
|
// add entry IDs of accumulated releases to prevent an infinite scrape loop
|
||||||
// when one page contains the same release as the previous
|
// when one page contains the same release as the previous
|
||||||
const duplicateReleasesSiteIdAndEntryIds = duplicateReleases
|
const duplicateReleasesSiteIdAndEntryIds = duplicateReleases
|
||||||
.concat(accReleases)
|
.concat(accReleases)
|
||||||
.reduce((acc, release) => {
|
.reduce((acc, release) => {
|
||||||
const siteId = release.site_id || release.site.id;
|
const siteId = release.entity_id || release.site.id;
|
||||||
const entryId = release.entry_id || release.entryId;
|
const entryId = release.entry_id || release.entryId;
|
||||||
|
|
||||||
if (!acc[siteId]) acc[siteId] = {};
|
if (!acc[siteId]) acc[siteId] = {};
|
||||||
|
@ -85,7 +85,7 @@ async function scrapeReleases(scraper, site, preData, upcoming = false) {
|
||||||
|
|
||||||
if (!Array.isArray(latestReleases)) {
|
if (!Array.isArray(latestReleases)) {
|
||||||
// scraper is unable to fetch the releases and returned a HTTP code or null
|
// scraper is unable to fetch the releases and returned a HTTP code or null
|
||||||
logger.warn(`Scraper returned ${latestReleases} when fetching latest from '${site.name}' (${site.network.name})`);
|
logger.warn(`Scraper returned ${latestReleases} when fetching latest from '${site.name}' (${site.parent?.name})`);
|
||||||
return accReleases;
|
return accReleases;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,7 +102,7 @@ async function scrapeReleases(scraper, site, preData, upcoming = false) {
|
||||||
|
|
||||||
const pageAccReleases = accReleases.concat(uniqueReleases);
|
const pageAccReleases = accReleases.concat(uniqueReleases);
|
||||||
|
|
||||||
logger.verbose(`Scraped '${site.name}' (${site.network.name}) ${upcoming ? 'upcoming' : 'latest'} page ${page}, found ${uniqueReleases.length} unique updates`);
|
logger.verbose(`Scraped '${site.name}' (${site.parent?.name}) ${upcoming ? 'upcoming' : 'latest'} page ${page}, found ${uniqueReleases.length} unique updates`);
|
||||||
|
|
||||||
if (needNextPage(uniqueReleases, pageAccReleases)) {
|
if (needNextPage(uniqueReleases, pageAccReleases)) {
|
||||||
return scrapePage(page + 1, pageAccReleases);
|
return scrapePage(page + 1, pageAccReleases);
|
||||||
|
@ -135,7 +135,7 @@ async function scrapeLatestReleases(scraper, site, preData) {
|
||||||
try {
|
try {
|
||||||
return await scrapeReleases(scraper, site, preData, false);
|
return await scrapeReleases(scraper, site, preData, false);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.warn(`Failed to scrape latest updates for '${site.slug}' (${site.network.slug}): ${error.message}`);
|
logger.warn(`Failed to scrape latest updates for '${site.slug}' (${site.parent?.slug}): ${error.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return [];
|
return [];
|
||||||
|
@ -149,7 +149,7 @@ async function scrapeUpcomingReleases(scraper, site, preData) {
|
||||||
try {
|
try {
|
||||||
return await scrapeReleases(scraper, site, preData, true);
|
return await scrapeReleases(scraper, site, preData, true);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.warn(`Failed to scrape upcoming updates for '${site.slug}' (${site.network.slug}): ${error.message}`);
|
logger.warn(`Failed to scrape upcoming updates for '${site.slug}' (${site.parent?.slug}): ${error.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return [];
|
return [];
|
||||||
|
@ -165,18 +165,18 @@ async function scrapeSiteReleases(scraper, site, preData) {
|
||||||
: [],
|
: [],
|
||||||
]);
|
]);
|
||||||
|
|
||||||
logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${site.name}' (${site.network.name})`);
|
logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${site.name}' (${site.parent.name})`);
|
||||||
|
|
||||||
return [...latestReleases, ...upcomingReleases];
|
return [...latestReleases, ...upcomingReleases];
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeSite(site, accSiteReleases) {
|
async function scrapeSite(site, accSiteReleases) {
|
||||||
const scraper = scrapers.releases[site.slug]
|
const scraper = scrapers.releases[site.slug]
|
||||||
|| scrapers.releases[site.network.slug]
|
|| scrapers.releases[site.parent?.slug]
|
||||||
|| scrapers.releases[site.network.parent?.slug];
|
|| scrapers.releases[site.parent?.parent?.slug];
|
||||||
|
|
||||||
if (!scraper) {
|
if (!scraper) {
|
||||||
logger.warn(`No scraper found for '${site.name}' (${site.network.name})`);
|
logger.warn(`No scraper found for '${site.name}' (${site.parent.name})`);
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -196,12 +196,12 @@ async function scrapeSite(site, accSiteReleases) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeNetworkSequential(network) {
|
async function scrapeNetworkSequential(networkEntity) {
|
||||||
return Promise.reduce(
|
return Promise.reduce(
|
||||||
network.sites,
|
networkEntity.children,
|
||||||
async (chain, site) => {
|
async (chain, siteEntity) => {
|
||||||
const accSiteReleases = await chain;
|
const accSiteReleases = await chain;
|
||||||
const siteReleases = await scrapeSite(site, network, accSiteReleases);
|
const siteReleases = await scrapeSite(siteEntity, networkEntity, accSiteReleases);
|
||||||
|
|
||||||
return accSiteReleases.concat(siteReleases);
|
return accSiteReleases.concat(siteReleases);
|
||||||
},
|
},
|
||||||
|
@ -209,10 +209,10 @@ async function scrapeNetworkSequential(network) {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeNetworkParallel(network) {
|
async function scrapeNetworkParallel(networkEntity) {
|
||||||
return Promise.map(
|
return Promise.map(
|
||||||
network.sites,
|
networkEntity.children,
|
||||||
async site => scrapeSite(site, network),
|
async siteEntity => scrapeSite(siteEntity, networkEntity),
|
||||||
{ concurrency: 3 },
|
{ concurrency: 3 },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -222,8 +222,6 @@ async function fetchUpdates() {
|
||||||
? await fetchSitesFromArgv()
|
? await fetchSitesFromArgv()
|
||||||
: await fetchSitesFromConfig();
|
: await fetchSitesFromConfig();
|
||||||
|
|
||||||
// console.log('included', includedNetworks);
|
|
||||||
|
|
||||||
const scrapedNetworks = await Promise.map(
|
const scrapedNetworks = await Promise.map(
|
||||||
includedNetworks,
|
includedNetworks,
|
||||||
async network => (network.parameters?.sequential
|
async network => (network.parameters?.sequential
|
||||||
|
|
|
@ -45,7 +45,7 @@ function slugify(string, delimiter = '-', {
|
||||||
return string;
|
return string;
|
||||||
}
|
}
|
||||||
|
|
||||||
const slugComponents = string.trim().toLowerCase().match(/[A-Za-zÀ-ÖØ-öø-ÿ]+/g);
|
const slugComponents = string.trim().toLowerCase().match(/[A-Za-zÀ-ÖØ-öø-ÿ0-9]+/g);
|
||||||
|
|
||||||
if (!slugComponents) {
|
if (!slugComponents) {
|
||||||
return '';
|
return '';
|
||||||
|
|
Loading…
Reference in New Issue