Refactored deep and store modules to use entities.

ThePendulum 2020-06-25 02:26:25 +02:00
parent f0a89df6ab
commit 4959dfd14f
14 changed files with 132 additions and 164 deletions

View File

@ -45,6 +45,7 @@ async function mounted() {
'double-penetration',
'facial',
'creampie',
'squirting',
],
appearance: [
'asian',
@ -100,6 +101,7 @@ async function mounted() {
],
misc: [
'gaping',
'squirting',
'oil',
],
};

View File

@ -57,7 +57,7 @@ function initActorActions(store, _router) {
description
createdAt
updatedAt
network {
network: entity {
id
name
slug
@ -80,12 +80,7 @@ function initActorActions(store, _router) {
profiles: actorsProfiles {
description
descriptionHash
network {
id
slug
name
}
site {
network: entity {
id
slug
name
@ -162,12 +157,12 @@ function initActorActions(store, _router) {
${releaseActorsFragment}
${releaseTagsFragment}
${releasePosterFragment}
site {
site: entity {
id
name
slug
url
network {
network: parent {
id
name
slug
@ -265,7 +260,7 @@ function initActorActions(store, _router) {
dateOfBirth
dateOfDeath
gender
network {
network: entity {
id
name
slug
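The store queries above rely on GraphQL field aliasing: the API now exposes generic `entity` and `parent` relations, and the client aliases them back to the `site` and `network` names the existing components expect. A minimal sketch of the same pattern, assuming a graphql-request client and a `releases` root field shaped like the fragments above (both are assumptions, not part of this commit):

// Sketch only: the query shape mirrors the fragments above, but the client
// setup (graphql-request) and the `releases` root field are assumptions.
const { request, gql } = require('graphql-request');

const releaseQuery = gql`
  query Releases {
    releases {
      id
      title
      site: entity {       # alias keeps existing component code working
        id
        name
        slug
        network: parent {  # the parent entity stands in for the old "network"
          id
          name
          slug
        }
      }
    }
  }
`;

async function fetchReleases(endpoint) {
  const { releases } = await request(endpoint, releaseQuery);
  return releases; // each release still exposes release.site.network, as before
}

Because the aliasing happens in the query, component code that reads `release.site.network.slug` keeps working unchanged.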

Binary file added (not shown), 1.8 KiB

Binary file added (not shown), 492 KiB

Binary file added (not shown), 6.8 KiB

Binary file added (not shown), 27 KiB

View File

@ -635,6 +635,7 @@ const tagPosters = [
['piercings', 0, 'Kaegune in "When The Sun Goes Down" for Suicide Girls'],
['pussy-eating', 0, 'Kali Roses licking Emily Willis\' pussy in "Peeping On My Neighbor" for Girl Girl'],
['redhead', 1, 'Lacy Lennon in "Girl Crush" for When Girls Play'],
['squirting', 0, 'Veronica Rodriguez in "Hot Latina Squirting" for Jules Jordan'],
['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
['swallowing', 'poster'],
['teen', 0, 'Eva Elfie in "Fresh New Talent" for Club Seventeen'],

View File

@ -20,7 +20,6 @@ const logger = require('./logger')(__filename);
const { toBaseReleases } = require('./deep');
const { associateAvatars } = require('./media');
const { curateSite } = require('./sites');
const slugify = require('./utils/slugify');
const capitalize = require('./utils/capitalize');
@ -120,7 +119,7 @@ function toBaseActors(actorsOrNames, release) {
const baseActor = {
name,
slug,
network: release?.site.network,
entity: release?.site?.network || release?.entity?.parent || null,
};
if (actorOrName.name) {
@ -144,7 +143,7 @@ function curateActor(actor, withDetails = false) {
name: actor.name,
slug: actor.slug,
gender: actor.gender,
networkId: actor.entity_id,
entityId: actor.entity_id,
aliasFor: actor.alias_for,
dateOfBirth: actor.date_of_birth,
birthCountry: actor.birth_country_alpha2,
@ -155,10 +154,10 @@ function curateActor(actor, withDetails = false) {
slug: actor.slug,
gender: actor.alias.gender,
},
network: actor.network && {
id: actor.network.id,
name: actor.network.name,
slug: actor.network.slug,
entity: actor.entity && {
id: actor.entity.id,
name: actor.entity.name,
slug: actor.entity.slug,
},
dateOfDeath: actor.date_of_death,
cup: actor.cup,
@ -224,8 +223,7 @@ function curateProfileEntry(profile) {
const curatedProfileEntry = {
...(profile.update !== false && { id: profile.update }),
actor_id: profile.id,
site_id: profile.site?.id || null,
entity_id: profile.network?.id || null,
entity_id: profile.entity?.id || null,
date_of_birth: profile.dateOfBirth,
date_of_death: profile.dateOfDeath,
gender: profile.gender,
@ -268,8 +266,7 @@ async function curateProfile(profile) {
name: profile.name,
avatar: profile.avatar,
scraper: profile.scraper,
site: profile.site,
network: profile.network,
entity: profile.entity,
update: profile.update,
};
@ -343,7 +340,7 @@ async function curateProfile(profile) {
const { href } = new URL(social);
return href;
} catch (error) {
logger.warn(`Profile scraper for '${profile.site.name}' returned invalid social link: ${social}`);
logger.warn(`Profile scraper for '${profile.entity.name}' returned invalid social link: ${social}`);
return null;
}
}).filter(Boolean)
@ -351,9 +348,9 @@ async function curateProfile(profile) {
curatedProfile.releases = toBaseReleases(profile.releases);
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.ethnicity}`);
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.hairColor || profile.hair}`);
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.eyes}`);
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`);
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`);
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.entity.name}' scraper: ${profile.eyes}`);
return curatedProfile;
} catch (error) {
@ -499,7 +496,7 @@ async function upsertProfiles(profiles) {
}
}
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId) {
async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId) {
const profiles = Promise.map(sources, async (source) => {
try {
// config may group sources to try until success
@ -507,24 +504,25 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, exist
try {
const scraper = scrapers[scraperSlug];
const context = {
site: sitesBySlug[scraperSlug] || null,
network: networksBySlug[scraperSlug] || sitesBySlug[scraperSlug]?.network || null,
site: entitiesBySlug[scraperSlug] || null,
network: entitiesBySlug[scraperSlug] || null,
entity: entitiesBySlug[scraperSlug] || null,
scraper: scraperSlug,
};
const label = context.site?.name || context.network?.name;
const label = context.entity?.name;
if (!scraper?.fetchProfile) {
logger.warn(`No profile scraper available for ${scraperSlug}`);
throw new Error(`No profile scraper available for ${scraperSlug}`);
}
if (!context.site && !context.network) {
logger.warn(`No site or network found for ${scraperSlug}`);
throw new Error(`No site or network found for ${scraperSlug}`);
if (!context.entity) {
logger.warn(`No entity found for ${scraperSlug}`);
throw new Error(`No entity found for ${scraperSlug}`);
}
const existingProfile = existingProfilesByActorNetworkSiteId[actor.id]?.[context.network?.id || null]?.[context.site?.id || null];
const existingProfile = existingProfilesByActorEntityId[actor.id]?.[context.entity?.id || null];
if (existingProfile && !argv.force) {
logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);
@ -574,20 +572,14 @@ async function scrapeActors(actorNames) {
const baseActors = toBaseActors(actorNames);
const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
const siteSlugs = sources.flat();
const entitySlugs = sources.flat();
const [networks, sites, existingActorEntries] = await Promise.all([
const [entities, existingActorEntries] = await Promise.all([
knex('entities')
.where('type', 2)
.whereIn('slug', siteSlugs),
knex('entities')
.select(
'entities.*',
'parents.name as network_name', 'parents.slug as network_slug', 'parents.url as network_url', 'parents.description as network_description', 'parents.parameters as network_parameters',
)
.where('type', 2)
.whereIn('entities.slug', siteSlugs)
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id'),
.select(knex.raw('entities.*, row_to_json(parents) as parent'))
.whereIn('entities.slug', entitySlugs)
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
.orderBy('entities.type'),
knex('actors')
.select(['id', 'name', 'slug'])
.modify((queryBuilder) => {
@ -598,8 +590,7 @@ async function scrapeActors(actorNames) {
.whereNull('alias_for'),
]);
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {});
const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: curateSite(site) }), {});
const entitiesBySlug = entities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
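The separate `networks` and `sites` lookups collapse into one query against `entities`, with the parent row folded in as JSON. A minimal sketch of the pattern, assuming PostgreSQL for `row_to_json` and the `entities.parent_id` self-reference shown above; the helper name is illustrative:

const knex = require('./knex');

// Resolve a list of entity slugs to entity rows, each with its parent
// embedded as `entity.parent` (null for top-level networks).
async function getEntitiesBySlug(entitySlugs) {
  const entities = await knex('entities')
    .select(knex.raw('entities.*, row_to_json(parents) as parent'))
    .whereIn('entities.slug', entitySlugs)
    .leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
    .orderBy('entities.type'); // channels sort after networks

  // later rows win, so a channel with the same slug overwrites its network
  return entities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});
}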
@ -611,20 +602,17 @@ async function scrapeActors(actorNames) {
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id));
const existingProfilesByActorNetworkSiteId = existingProfiles.reduce((acc, profile) => ({
const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({
...acc,
[profile.actor_id]: {
...acc[profile.actor_id],
[profile.entity_id]: {
...acc[profile.entity_id],
[profile.site_id]: profile,
},
[profile.entity_id]: profile,
},
}), {});
const profilesPerActor = await Promise.map(
actors,
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId),
async actor => scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId),
{ concurrency: 10 },
);
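The profile lookup flattens from actor to network to site down to actor to entity, which is what `scrapeProfiles` checks before scraping again. A small sketch of the shape with hypothetical rows, showing how a hit short-circuits the scrape unless `--force` is given:

// Hypothetical rows as they might come back from knex('actors_profiles')
const existingProfiles = [
  { actor_id: 1, entity_id: 10, description: 'profile text' },
  { actor_id: 1, entity_id: 12, description: 'profile text' },
];

// actorId -> entityId -> profile row
const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({
  ...acc,
  [profile.actor_id]: {
    ...acc[profile.actor_id],
    [profile.entity_id]: profile,
  },
}), {});

// Inside scrapeProfiles: skip entities that already have a profile, unless forced
function shouldScrape(actor, entity, argv) {
  const existingProfile = existingProfilesByActorEntityId[actor.id]?.[entity.id];
  return !existingProfile || Boolean(argv.force);
}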
@ -647,13 +635,11 @@ async function scrapeActors(actorNames) {
}
async function getOrCreateActors(baseActors, batchId) {
console.log(baseActors);
const existingActors = await knex('actors')
.select('id', 'alias_for', 'name', 'slug', 'entity_id')
.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
.whereNull('entity_id')
.orWhereIn(['slug', 'entity_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.network.id]));
.orWhereIn(['slug', 'entity_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.entity.id]));
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
@ -664,7 +650,7 @@ async function getOrCreateActors(baseActors, batchId) {
},
}), {});
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.network.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'entity_id']);
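Actor uniqueness is now slug plus entity rather than slug plus network: the composite `whereIn(['slug', 'entity_id'], ...)` matches per-entity actor rows, while `whereNull('entity_id')` still catches entity-less (global) entries. The same lookup, extracted into a hypothetical helper for clarity:

const knex = require('./knex');

// Find actors that already exist either globally (entity_id is null)
// or for the specific entity a base actor was scraped from.
async function findExistingActors(baseActors) {
  return knex('actors')
    .select('id', 'alias_for', 'name', 'slug', 'entity_id')
    .whereIn('slug', baseActors.map(baseActor => baseActor.slug))
    .whereNull('entity_id')
    .orWhereIn(
      ['slug', 'entity_id'],
      baseActors.map(baseActor => [baseActor.slug, baseActor.entity.id]),
    );
}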
@ -722,7 +708,7 @@ async function fetchActor(actorId) {
const actor = await knex('actors')
.select(knex.raw(`
actors.*,
row_to_json(networks) as network,
row_to_json(entities) as entity,
row_to_json(actor_alias) as alias,
row_to_json(birth_country) as birth_country,
row_to_json(residence_country) as residence_country,
@ -737,7 +723,7 @@ async function fetchActor(actorId) {
queryBuilder.where('actors.id', actorId);
})
.leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
.leftJoin('networks', 'networks.id', 'actors.entity_id')
.leftJoin('entities', 'entities.id', 'actors.entity_id')
.leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
.leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
.leftJoin('media', 'media.id', 'actors.avatar_media_id')

View File

@ -7,8 +7,6 @@ const include = require('./utils/argv-include')(argv);
const logger = require('./logger')(__filename);
const knex = require('./knex');
const scrapers = require('./scrapers/scrapers');
const { curateSites } = require('./sites');
const { curateNetworks } = require('./networks');
function urlToSiteSlug(url) {
try {
@ -19,40 +17,31 @@ function urlToSiteSlug(url) {
return slug;
} catch (error) {
logger.warn(`Failed to derive site slug from '${url}': ${error.message}`);
logger.warn(`Failed to derive entity slug from '${url}': ${error.message}`);
return null;
}
}
async function findSites(baseReleases) {
const baseReleasesWithoutSite = baseReleases.filter(release => release.url && !release.site);
async function findEntities(baseReleases) {
const baseReleasesWithoutEntity = baseReleases.filter(release => release.url && !release.site && !release.entity);
const siteSlugs = Array.from(new Set(
baseReleasesWithoutSite
const entitySlugs = Array.from(new Set(
baseReleasesWithoutEntity
.map(baseRelease => urlToSiteSlug(baseRelease.url))
.filter(Boolean),
));
const siteEntries = await knex('entities')
const entities = await knex('entities')
.select(knex.raw('entities.*, row_to_json(parents) as parent'))
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
.select('entities.*', 'parents.id as network_id', 'parents.name as network_name', 'parents.slug as network_slug', 'parents.url as network_url', 'parents.parameters as network_parameters', 'parents.description as network_description')
.where('entities.type', 2)
.whereIn('entities.slug', siteSlugs);
.whereIn('entities.slug', entitySlugs)
.orderBy('entities.type', 'asc');
const networkEntries = await knex('entities')
.where('type', 1)
.whereIn('slug', siteSlugs);
// channel entity will overwrite network entity
const entitiesBySlug = entities.reduce((accEntities, entity) => ({ ...accEntities, [entity.slug]: entity }), {});
const sites = await curateSites(siteEntries, true, false);
const networks = await curateNetworks(networkEntries, true, false, false);
const markedNetworks = networks.map(network => ({ ...network, isNetwork: true }));
const sitesBySlug = []
.concat(markedNetworks, sites)
.reduce((accSites, site) => ({ ...accSites, [site.slug]: site }), {});
return sitesBySlug;
return entitiesBySlug;
}
function toBaseReleases(baseReleasesOrUrls) {
@ -92,23 +81,22 @@ function toBaseReleases(baseReleasesOrUrls) {
.filter(Boolean);
}
async function scrapeRelease(baseRelease, sites, type = 'scene') {
const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)];
async function scrapeRelease(baseRelease, entities, type = 'scene') {
const entity = baseRelease.entity || baseRelease.site || entities[urlToSiteSlug(baseRelease.url)];
if (!site) {
logger.warn(`No site available for ${baseRelease.url}`);
if (!entity) {
logger.warn(`No entity available for ${baseRelease.url}`);
return baseRelease;
}
if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
return {
...baseRelease,
site,
entity,
};
}
const siteWithFallbackNetwork = site.isNetwork ? { ...site, network: site } : site; // make site.network available, even when site is network fallback
const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
const scraper = scrapers.releases[entity.slug] || scrapers.releases[entity.parent?.slug];
if (!scraper) {
logger.warn(`Could not find scraper for ${baseRelease.url}`);
@ -116,7 +104,7 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
}
if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
logger.warn(`The '${entity.name}'-scraper cannot fetch individual ${type}s`);
return baseRelease;
}
@ -124,14 +112,14 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
logger.verbose(`Fetching ${type} ${baseRelease.url}`);
const scrapedRelease = type === 'scene'
? await scraper.fetchScene(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include)
: await scraper.fetchMovie(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include);
? await scraper.fetchScene(baseRelease.url, entity, baseRelease, null, include)
: await scraper.fetchMovie(baseRelease.url, entity, baseRelease, null, include);
const mergedRelease = {
...baseRelease,
...scrapedRelease,
deep: !!scrapedRelease,
site,
entity,
};
if (!mergedRelease.entryId) {
@ -155,19 +143,19 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
}
}
async function scrapeReleases(baseReleases, sites, type) {
async function scrapeReleases(baseReleases, entities, type) {
return Promise.map(
baseReleases,
async baseRelease => scrapeRelease(baseRelease, sites, type),
async baseRelease => scrapeRelease(baseRelease, entities, type),
{ concurrency: 10 },
);
}
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
const baseReleases = toBaseReleases(baseReleasesOrUrls);
const sites = await findSites(baseReleases);
const entities = await findEntities(baseReleases);
const deepReleases = await scrapeReleases(baseReleases, sites, type);
const deepReleases = await scrapeReleases(baseReleases, entities, type);
return deepReleases.filter(Boolean);
}
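`scrapeRelease` now resolves its scraper from the entity slug first and the parent slug second, so a channel-specific scraper wins over the network-level one. A condensed sketch of that resolution, assuming the `scrapers.releases` registry referenced above:

// entity comes from findEntities(); scrapers.releases maps slugs to scraper modules
function resolveReleaseScraper(entity, scrapers) {
  // prefer a channel-specific scraper, fall back to the parent network's scraper
  return scrapers.releases[entity.slug] || scrapers.releases[entity.parent?.slug] || null;
}

// e.g. a hypothetical channel without its own scraper:
// resolveReleaseScraper({ slug: 'some-channel', parent: { slug: 'some-network' } }, scrapers)
// -> scrapers.releases['some-network']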

View File

@ -3,21 +3,21 @@
const util = require('util');
const knex = require('../knex');
const { get, geta, ed, fd, ctxa } = require('../utils/q');
const { get, geta, ed, formatDate, ctxa } = require('../utils/q');
const slugify = require('../utils/slugify');
const { feetInchesToCm } = require('../utils/convert');
async function getChannelRegExp(site) {
if (!['hushpass', 'interracialpass'].includes(site.network.slug)) return null;
if (!['hushpass', 'interracialpass'].includes(site.parent.slug)) return null;
const sites = await knex('sites').where('network_id', site.network.id);
const sites = await knex('sites').where('network_id', site.parent.id);
return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
}
function deriveEntryId(release) {
if (release.date && release.title) {
return `${slugify(fd(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
return `${slugify(formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
}
return null;
@ -140,7 +140,7 @@ function scrapeScene({ html, qu }, site, url, baseRelease) {
release.title = qu.q('.centerwrap h2', true);
release.description = qu.q('.videocontent p', true);
release.date = qu.date('.videodetails .date', 'MM/DD/YYYY');
release.date = qu.date('.videodetails .date', ['MM/DD/YYYY', 'YYYY-MM-DD']);
release.duration = qu.dur('.videodetails .date');
release.actors = qu.all('.modelname a', true);
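The date call now passes a list of candidate formats, so the scraper copes whether the page prints `MM/DD/YYYY` or an ISO-style date. The `qu` helpers wrap the actual parsing; the sketch below shows the equivalent with moment, which accepts an array of formats (the helper and the moment usage are illustrative assumptions, not this repo's API):

const moment = require('moment');

function parseReleaseDate(rawText) {
  // moment tries each format in order and keeps the first valid strict parse
  const date = moment(rawText.trim(), ['MM/DD/YYYY', 'YYYY-MM-DD'], true);
  return date.isValid() ? date.toDate() : null;
}

// parseReleaseDate('06/25/2020') and parseReleaseDate('2020-06-25')
// both yield the same Date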

View File

@ -8,7 +8,7 @@ const knex = require('./knex');
const slugify = require('./utils/slugify');
const { associateActors, scrapeActors } = require('./actors');
const { associateReleaseTags } = require('./tags');
const { curateSite } = require('./sites');
const { curateEntity } = require('./entities');
const { associateReleaseMedia } = require('./media');
function curateReleaseEntry(release, batchId, existingRelease) {
@ -20,9 +20,9 @@ function curateReleaseEntry(release, batchId, existingRelease) {
const curatedRelease = {
title: release.title,
entry_id: release.entryId || null,
entity_id: release.site?.id,
shoot_id: release.shootId || null,
entity_id: release.entity.id,
studio_id: release.studio?.id || null,
shoot_id: release.shootId || null,
url: release.url,
date: Number(release.date) ? release.date : null,
slug,
@ -45,51 +45,47 @@ function curateReleaseEntry(release, batchId, existingRelease) {
return curatedRelease;
}
async function attachChannelSites(releases) {
const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isNetwork || release.site.slug !== release.channel));
async function attachChannelEntities(releases) {
const releasesWithoutEntity = releases.filter(release => release.channel && !release.entity && release.entity.type !== 1);
const channelSites = await knex('entities')
.leftJoin('entities AS parents', 'parents.id', 'entities.parent_id')
.select('entities.*', 'parents.name as network_name', 'parents.slug as network_slug', 'parents.url as network_url', 'parents.parameters as network_parameters', 'parents.description as network_description')
.whereIn('entities.slug', releasesWithoutSite.map(release => release.channel));
const channelEntities = await knex('entities')
.select(knex.raw('entities.*, row_to_json(parents) as parent'))
.whereIn('entities.slug', releasesWithoutEntity.map(release => release.channel))
.where('entities.type', 2)
.leftJoin('entities AS parents', 'parents.id', 'entities.parent_id');
const channelSitesBySlug = channelSites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
const channelEntitiesBySlug = channelEntities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});
const releasesWithChannelSite = await Promise.all(releases
const releasesWithChannelEntity = await Promise.all(releases
.map(async (release) => {
if (release.channel && channelSitesBySlug[release.channel]) {
const curatedSite = await curateSite(channelSitesBySlug[release.channel]);
if (release.channel && channelEntitiesBySlug[release.channel]) {
const curatedEntity = await curateEntity(channelEntitiesBySlug[release.channel]);
return {
...release,
site: curatedSite,
entity: curatedEntity,
};
}
if (release.site && !release.site.isNetwork) {
if (release.entity) {
return release;
}
if (release.site && release.site.isNetwork) {
return {
...release,
site: null,
network: release.site,
};
}
logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);
return null;
}));
return releasesWithChannelSite.filter(Boolean);
return releasesWithChannelEntity.filter(Boolean);
}
async function attachStudios(releases) {
const studioSlugs = releases.map(release => release.studio).filter(Boolean);
const studios = await knex('studios').whereIn('slug', studioSlugs);
const studios = await knex('entities')
.whereIn('slug', studioSlugs)
.where('type', 3);
const studioBySlug = studios.reduce((acc, studio) => ({ ...acc, [studio.slug]: studio }), {});
const releasesWithStudio = releases.map((release) => {
@ -111,7 +107,7 @@ async function attachStudios(releases) {
}
function attachReleaseIds(releases, storedReleases) {
const storedReleaseIdsBySiteIdAndEntryId = storedReleases.reduce((acc, release) => {
const storedReleaseIdsByEntityIdAndEntryId = storedReleases.reduce((acc, release) => {
if (!acc[release.entity_id]) acc[release.entity_id] = {};
acc[release.entity_id][release.entry_id] = release.id;
@ -120,29 +116,29 @@ function attachReleaseIds(releases, storedReleases) {
const releasesWithId = releases.map(release => ({
...release,
id: storedReleaseIdsBySiteIdAndEntryId[release.site.id][release.entryId],
id: storedReleaseIdsByEntityIdAndEntryId[release.entity.id][release.entryId],
}));
return releasesWithId;
}
function filterInternalDuplicateReleases(releases) {
const releasesBySiteIdAndEntryId = releases.reduce((acc, release) => {
if (!release.site) {
const releasesByEntityIdAndEntryId = releases.reduce((acc, release) => {
if (!release.entity) {
return acc;
}
if (!acc[release.site.id]) {
acc[release.site.id] = {};
if (!acc[release.entity.id]) {
acc[release.entity.id] = {};
}
acc[release.site.id][release.entryId] = release;
acc[release.entity.id][release.entryId] = release;
return acc;
}, {});
return Object.values(releasesBySiteIdAndEntryId)
.map(siteReleases => Object.values(siteReleases))
return Object.values(releasesByEntityIdAndEntryId)
.map(entityReleases => Object.values(entityReleases))
.flat();
}
@ -150,17 +146,17 @@ async function filterDuplicateReleases(releases) {
const internalUniqueReleases = filterInternalDuplicateReleases(releases);
const duplicateReleaseEntries = await knex('releases')
.whereIn(['entry_id', 'entity_id'], internalUniqueReleases.map(release => [release.entryId, release.site.id]));
.whereIn(['entry_id', 'entity_id'], internalUniqueReleases.map(release => [release.entryId, release.entity.id]));
const duplicateReleasesBySiteIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
const duplicateReleasesByEntityIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
if (!acc[release.entity_id]) acc[release.entity_id] = {};
acc[release.entity_id][release.entry_id] = true;
return acc;
}, {});
const duplicateReleases = internalUniqueReleases.filter(release => duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]);
const uniqueReleases = internalUniqueReleases.filter(release => !duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]);
const duplicateReleases = internalUniqueReleases.filter(release => duplicateReleasesByEntityIdAndEntryId[release.entity.id]?.[release.entryId]);
const uniqueReleases = internalUniqueReleases.filter(release => !duplicateReleasesByEntityIdAndEntryId[release.entity.id]?.[release.entryId]);
return {
uniqueReleases,
@ -216,10 +212,10 @@ async function storeReleases(releases) {
const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
const releasesWithSites = await attachChannelSites(releases);
const releasesWithStudios = await attachStudios(releasesWithSites);
const releasesWithChannels = await attachChannelEntities(releases);
const releasesWithStudios = await attachStudios(releasesWithChannels);
// uniqueness is site ID + entry ID, filter uniques after adding sites
// uniqueness is entity ID + entry ID, filter uniques after adding entities
const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);
const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
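With channel entities and studios attached, duplicate filtering keys on entity ID plus entry ID, first inside the batch and then against rows already stored in `releases`. A compact sketch of the two-step filter, using a flat Set of composite keys instead of the nested objects above (releases are assumed to already carry `entity` and `entryId`):

const knex = require('./knex');

async function filterNewReleases(releases) {
  // 1. in-batch dedupe: last release wins per (entity, entryId)
  const byKey = {};
  for (const release of releases.filter(release => release.entity)) {
    byKey[`${release.entity.id}:${release.entryId}`] = release;
  }
  const internalUnique = Object.values(byKey);

  // 2. drop anything already stored, using the same composite key
  const stored = await knex('releases')
    .whereIn(['entry_id', 'entity_id'], internalUnique.map(release => [release.entryId, release.entity.id]));

  const storedKeys = new Set(stored.map(release => `${release.entity_id}:${release.entry_id}`));

  return internalUnique.filter(release => !storedKeys.has(`${release.entity.id}:${release.entryId}`));
}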

View File

@ -27,27 +27,27 @@ async function matchReleaseTags(releases) {
return tagIdsBySlug;
}
async function getSiteTags(releases) {
const siteIds = releases.map(release => release.site.id);
const siteTags = await knex('sites_tags').whereIn('site_id', siteIds);
async function getEntityTags(releases) {
const entityIds = releases.map(release => release.entity.id);
const entityTags = await knex('entities_tags').whereIn('entity_id', entityIds);
const siteTagIdsBySiteId = siteTags.reduce((acc, siteTag) => {
if (!acc[siteTag.site_id]) {
acc[siteTag.site_id] = [];
const entityTagIdsByEntityId = entityTags.reduce((acc, entityTag) => {
if (!acc[entityTag.entity_id]) {
acc[entityTag.entity_id] = [];
}
acc[siteTag.site_id].push(siteTag.tag_id);
acc[entityTag.entity_id].push(entityTag.tag_id);
return acc;
}, {});
return siteTagIdsBySiteId;
return entityTagIdsByEntityId;
}
function buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId) {
function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId) {
const tagAssociations = releases
.map((release) => {
const siteTagIds = siteTagIdsBySiteId[release.site.id];
const entityTagIds = entityTagIdsByEntityId[release.entity.id];
const releaseTags = release.tags || [];
const releaseTagIds = releaseTags.every(tag => typeof tag === 'number')
@ -57,7 +57,7 @@ function buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId)
const tags = [...new Set(
// filter duplicates and empties
releaseTagIds
.concat(siteTagIds)
.concat(entityTagIds)
.filter(Boolean),
)]
.map(tagId => ({
@ -94,9 +94,9 @@ async function filterUniqueAssociations(tagAssociations) {
async function associateReleaseTags(releases) {
const tagIdsBySlug = await matchReleaseTags(releases);
const siteTagIdsBySiteId = await getSiteTags(releases);
const EntityTagIdsByEntityId = await getEntityTags(releases);
const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId);
const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, EntityTagIdsByEntityId);
const uniqueAssociations = await filterUniqueAssociations(tagAssociations);
await knex('releases_tags').insert(uniqueAssociations);
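Release tags now merge the scraper's own tags with tags attached to the entity, so channel- or network-wide tags are inherited by every release from that entity. A reduced sketch of the merge step, assuming `tagIdsBySlug` and `entityTagIdsByEntityId` built as above; the function name is illustrative:

// entityTagIdsByEntityId: entityId -> [tagId], built from entities_tags as above
// tagIdsBySlug: tag slug -> tagId, from matchReleaseTags
function collectReleaseTagIds(release, tagIdsBySlug, entityTagIdsByEntityId) {
  const releaseTagIds = (release.tags || []).map(tag => (typeof tag === 'number' ? tag : tagIdsBySlug[tag]));
  const entityTagIds = entityTagIdsByEntityId[release.entity.id] || [];

  // de-duplicate and drop unresolved slugs
  return [...new Set(releaseTagIds.concat(entityTagIds).filter(Boolean))];
}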

View File

@ -196,12 +196,12 @@ async function scrapeSite(site, accSiteReleases) {
}
}
async function scrapeNetworkSequential(network) {
async function scrapeNetworkSequential(networkEntity) {
return Promise.reduce(
network.sites,
async (chain, site) => {
networkEntity.children,
async (chain, siteEntity) => {
const accSiteReleases = await chain;
const siteReleases = await scrapeSite(site, network, accSiteReleases);
const siteReleases = await scrapeSite(siteEntity, networkEntity, accSiteReleases);
return accSiteReleases.concat(siteReleases);
},
@ -209,10 +209,10 @@ async function scrapeNetworkSequential(network) {
);
}
async function scrapeNetworkParallel(network) {
async function scrapeNetworkParallel(networkEntity) {
return Promise.map(
network.children,
async site => scrapeSite(site, network),
networkEntity.children,
async siteEntity => scrapeSite(siteEntity, networkEntity),
{ concurrency: 3 },
);
}
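Network scraping now iterates `entity.children` rather than a dedicated `network.sites` relation. The sequential variant threads accumulated releases through bluebird's `Promise.reduce`, so later channels can see what earlier siblings already produced; the parallel variant caps concurrency. A stripped-down sketch, with `scrapeSite` passed in to keep it self-contained:

const Promise = require('bluebird');

async function scrapeNetworkSequential(networkEntity, scrapeSite) {
  return Promise.reduce(
    networkEntity.children,
    async (chain, siteEntity) => {
      const accSiteReleases = await chain;
      const siteReleases = await scrapeSite(siteEntity, networkEntity, accSiteReleases);
      return accSiteReleases.concat(siteReleases);
    },
    Promise.resolve([]), // initial accumulator
  );
}

async function scrapeNetworkParallel(networkEntity, scrapeSite) {
  return Promise.map(
    networkEntity.children,
    async siteEntity => scrapeSite(siteEntity, networkEntity),
    { concurrency: 3 }, // avoid hitting all of a network's channels at once
  );
}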

View File

@ -45,7 +45,7 @@ function slugify(string, delimiter = '-', {
return string;
}
const slugComponents = string.trim().toLowerCase().match(/[A-Za-zÀ-ÖØ-öø-ÿ]+/g);
const slugComponents = string.trim().toLowerCase().match(/[A-Za-zÀ-ÖØ-öø-ÿ0-9]+/g);
if (!slugComponents) {
return '';
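The widened character class keeps digits during slugification instead of silently dropping them. A small before-and-after demonstration of the matcher (the wrapper below only exists to show the regex change; the real `slugify` signature is as above):

const oldMatcher = /[A-Za-zÀ-ÖØ-öø-ÿ]+/g;    // letters only: digits were dropped
const newMatcher = /[A-Za-zÀ-ÖØ-öø-ÿ0-9]+/g; // letters and digits

function slugifyWith(matcher, string, delimiter = '-') {
  const components = string.trim().toLowerCase().match(matcher);
  return components ? components.join(delimiter) : '';
}

// slugifyWith(oldMatcher, 'Scene 42') -> 'scene'
// slugifyWith(newMatcher, 'Scene 42') -> 'scene-42'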