diff --git a/migrations/20190325001339_releases.js b/migrations/20190325001339_releases.js index 0db84781..0980025a 100644 --- a/migrations/20190325001339_releases.js +++ b/migrations/20190325001339_releases.js @@ -352,7 +352,7 @@ exports.up = knex => Promise.resolve() table.string('shoot_id'); table.string('entry_id'); - table.unique(['site_id', 'entry_id']); + table.unique(['site_id', 'entry_id', 'type']); table.string('url', 1000); table.string('title'); diff --git a/public/img/logos/ddfnetwork/misc/busty-lover.png b/public/img/logos/ddfnetwork/misc/busty-lover.png index c15dcf62..1cf8ab13 100644 Binary files a/public/img/logos/ddfnetwork/misc/busty-lover.png and b/public/img/logos/ddfnetwork/misc/busty-lover.png differ diff --git a/public/img/logos/ddfnetwork/misc/fuck-in-hd.png b/public/img/logos/ddfnetwork/misc/fuck-in-hd.png index 2b7a82e0..af02e1d7 100644 Binary files a/public/img/logos/ddfnetwork/misc/fuck-in-hd.png and b/public/img/logos/ddfnetwork/misc/fuck-in-hd.png differ diff --git a/public/img/logos/ddfnetwork/misc/porn-world.png b/public/img/logos/ddfnetwork/misc/porn-world.png new file mode 100644 index 00000000..18a20875 Binary files /dev/null and b/public/img/logos/ddfnetwork/misc/porn-world.png differ diff --git a/public/img/logos/ddfnetwork/misc/porn-world_basic.png b/public/img/logos/ddfnetwork/misc/porn-world_basic.png new file mode 100644 index 00000000..1bcc73ba Binary files /dev/null and b/public/img/logos/ddfnetwork/misc/porn-world_basic.png differ diff --git a/public/img/logos/ddfnetwork/misc/porn-world_basic.svg b/public/img/logos/ddfnetwork/misc/porn-world_basic.svg new file mode 100644 index 00000000..77b4e705 --- /dev/null +++ b/public/img/logos/ddfnetwork/misc/porn-world_basic.svg @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/public/img/tags/anal/2.jpeg b/public/img/tags/anal/2.jpeg new file mode 100644 index 00000000..df5270a0 Binary files /dev/null and b/public/img/tags/anal/2.jpeg differ diff --git a/public/img/tags/anal/2_thumb.jpeg b/public/img/tags/anal/2_thumb.jpeg new file mode 100644 index 00000000..1c4adba3 Binary files /dev/null and b/public/img/tags/anal/2_thumb.jpeg differ diff --git a/public/img/tags/caucasian/1.jpeg b/public/img/tags/caucasian/1.jpeg new file mode 100644 index 00000000..4223a087 Binary files /dev/null and b/public/img/tags/caucasian/1.jpeg differ diff --git a/public/img/tags/caucasian/1_thumb.jpeg b/public/img/tags/caucasian/1_thumb.jpeg new file mode 100644 index 00000000..8183bc61 Binary files /dev/null and b/public/img/tags/caucasian/1_thumb.jpeg differ diff --git a/public/img/tags/double-anal/6.jpeg b/public/img/tags/double-anal/6.jpeg new file mode 100644 index 00000000..46cd0c3a Binary files /dev/null and b/public/img/tags/double-anal/6.jpeg differ diff --git a/public/img/tags/double-anal/6_thumb.jpeg b/public/img/tags/double-anal/6_thumb.jpeg new file mode 100644 index 00000000..6914c354 Binary files /dev/null and b/public/img/tags/double-anal/6_thumb.jpeg differ diff --git a/seeds/04_media.js b/seeds/04_media.js index 690a06b3..93f41d5a 100644 --- a/seeds/04_media.js +++ b/seeds/04_media.js @@ -1,40 +1,40 @@ const upsert = require('../src/utils/upsert'); const tagPosters = [ + ['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'], + ['anal', 2, 'Sheena Shaw for Bang Bros'], ['anal-creampie', 0, 'Gina Valentina and Jane Wilde in "A Very Special Anniversary" for Tushy'], + ['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'], ['ass-to-mouth', 'poster', 'Alysa Gap and Logan in "Anal Buffet 4" for Evil Angel'], ['bdsm', 0, 'Dani Daniels in "The Traning of Dani Daniels, Day 2" for The Training of O at Kink'], ['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'], + ['blowbang', 'poster'], + ['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'], + ['bukkake', 'poster'], + ['caucasian', 1, 'Sheena Shaw for Brazzers'], + ['creampie', 'poster'], ['da-tp', 0, 'Natasha Teen in LegalPorno SZ2164'], ['deepthroat', 0, 'Chanel Grey in "Deepthroating Is Fun" for Throated'], - ['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'], + ['double-anal', 2, 'Lana Rhoades in "Lana Rhoades Unleashed" for HardX'], ['double-penetration', 'poster', 'Mia Malkova in "DP!" for HardX'], ['double-vaginal', 'poster', 'Riley Reid in "Pizza That Ass" for Reid My Lips'], ['dv-tp', 'poster', 'Juelz Ventura in "Gangbanged 5" for Elegant Angel'], - ['oral-creampie', 1, 'Keisha Grey in Brazzers House'], - ['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'], - ['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'], - ['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'], - ['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'], - ['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'], - ['blowbang', 'poster'], - ['bukkake', 'poster'], - ['caucasian', 'poster'], - ['creampie', 'poster'], ['ebony', 1, 'Sarah Banks for Brazzers'], - ['facial', 'poster'], ['facefucking', '1', 'Carrie for Young Throats'], + ['facial', 'poster'], ['gangbang', 'poster', 'Kristen Scott in "Interracial Gangbang!" for Jules Jordan'], ['gaping', 1, 'Vina Sky in "Vina Sky Does Anal" for HardX'], ['interracial', 'poster'], ['latina', 'poster'], ['mff', 0, 'Madison Ivy and Adriana Chechik in "Day With A Pornstar" for Brazzers'], ['mfm', 'poster'], + ['oral-creampie', 1, 'Keisha Grey in Brazzers House'], ['orgy', 'poster'], ['schoolgirl', 1, 'Eliza Ibarra for Brazzers'], ['swallowing', 'poster'], ['tattoo', 'poster', 'Kali Roses in "Goes All In For Anal" for Hussie Pass'], ['trainbang', 'poster', 'Kali Roses in "Passing Me Around" for Blacked'], + ['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'], ] .map(([slug, filename, comment], index) => ({ tagSlug: slug, @@ -49,12 +49,15 @@ const tagPhotos = [ ['airtight', 2, 'Dakota Skye in "Dakota Goes Nuts" for ArchAngel'], ['airtight', 3, 'Anita Bellini in "Triple Dick Gangbang" for Hands On Hardcore (DDF Network)'], ['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'], + ['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'], ['anal', 0], + ['caucasian', 'poster'], ['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'], ['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'], ['da-tp', 3, 'Evelina Darling in GIO294'], ['da-tp', 4, 'Ninel Mojado aka Mira Cuckold in GIO063 for LegalPorno'], - ['double-anal', 2, 'Lana Rhoades in "Gangbang Me 3" for HardX'], + ['double-anal', 6, 'Sheena Shaw in "Ass Worship 14" for Jules Jordan'], + ['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'], ['double-anal', 'poster', 'Haley Reed in "Young Hot Ass" for Evil Angel'], ['double-anal', 0, 'Nicole Black doing double anal during a gangbang in GIO971 for LegalPorno'], ['double-anal', 1, 'Ria Sunn in SZ1801 for LegalPorno'], diff --git a/src/app.js b/src/app.js index 130bc118..e9fc1171 100644 --- a/src/app.js +++ b/src/app.js @@ -24,18 +24,19 @@ async function init() { } const updateBaseScenes = (argv.scrape || argv.sites || argv.networks) && await fetchUpdates(); - const deepScenes = await fetchScenes([...(argv.scenes || []), ...(updateBaseScenes || [])]); + const deepScenes = argv.deep && await fetchScenes([...(argv.scenes || []), ...(updateBaseScenes || [])]); - console.log(deepScenes.map(scene => scene.movie)); + const sceneMovies = deepScenes && argv.sceneMovies && deepScenes.map(scene => scene.movie).filter(Boolean); + const deepMovies = await fetchMovies([...(argv.movies || []), ...(sceneMovies || [])]); - const argvDeepMovies = argv.movies && await fetchMovies(argv.movies); + if (argv.save) { + await storeReleases([ + ...(deepScenes || []), + ...(deepMovies || []), + ]); - await storeReleases([ - ...(deepScenes || []), - ...(argvDeepMovies || []), - ]); - - // await storeReleaseActors(updateReleases); + // await storeReleaseActors(updateReleases); + } knex.destroy(); } diff --git a/src/argv.js b/src/argv.js index 9abef0d1..e01ac93d 100644 --- a/src/argv.js +++ b/src/argv.js @@ -29,21 +29,27 @@ const { argv } = yargs type: 'array', alias: 'actor', }) - .option('with-scenes', { - describe: 'Fetch all scenes for an actor or movie', + .option('actor-scenes', { + describe: 'Fetch all scenes for an actor', type: 'boolean', alias: 'with-releases', default: false, }) - .option('with-movies', { + .option('movie-scenes', { + describe: 'Fetch all scenes for a movie', + type: 'boolean', + alias: 'with-releases', + default: false, + }) + .option('scene-movies', { describe: 'Fetch movies for scenes', type: 'boolean', default: true, }) - .option('with-profiles', { + .option('profiles', { describe: 'Scrape profiles for new actors after fetching scenes', type: 'boolean', - alias: 'with-actors', + alias: 'bios', default: false, }) .option('scene', { diff --git a/src/deep.js b/src/deep.js index 441e3b09..9189b422 100644 --- a/src/deep.js +++ b/src/deep.js @@ -99,7 +99,7 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') { }; } - const scraper = scrapers.releases[site.slug]; + const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug]; if (!scraper) { logger.warn(`Could not find scraper for ${baseRelease.url}`); @@ -124,6 +124,7 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') { }; if (scrapedRelease && baseRelease?.tags) { + // accumulate all available tags mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags); } diff --git a/src/scrapers/ddfnetwork.js b/src/scrapers/ddfnetwork.js index 590d7bf8..ab70ccb9 100644 --- a/src/scrapers/ddfnetwork.js +++ b/src/scrapers/ddfnetwork.js @@ -141,7 +141,9 @@ async function fetchLatest(site, page = 1) { } async function fetchScene(url, site) { - const res = await bhttp.get(`https://ddfnetwork.com${new URL(url).pathname}`); + // DDF's main site moved to Porn World + // const res = await bhttp.get(`https://ddfnetwork.com${new URL(url).pathname}`); + const res = await bhttp.get(url); return scrapeScene(res.body.toString(), url, site); } diff --git a/src/store-releases.js b/src/store-releases.js index 0a938a1c..2afd2dfa 100644 --- a/src/store-releases.js +++ b/src/store-releases.js @@ -2,10 +2,10 @@ const config = require('config'); -const argv = require('./argv'); const logger = require('./logger')(__filename); const knex = require('./knex'); const slugify = require('./utils/slugify'); +const { associateTags } = require('./tags'); function curateReleaseEntry(release, batchId, existingRelease) { const slug = slugify(release.title, '-', { @@ -34,7 +34,7 @@ function curateReleaseEntry(release, batchId, existingRelease) { updated_batch_id: batchId, }; - if (!existingRelease) { + if (!existingRelease && !release.id) { curatedRelease.created_batch_id = batchId; } @@ -60,7 +60,7 @@ async function attachChannelSites(releases) { }; } - logger.error(`Unable to match channel '${release.channel.slug || release.channel}' from generic URL ${release.url}`); + logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`); return null; }) @@ -93,15 +93,41 @@ async function attachStudios(releases) { return releasesWithStudio; } +function attachReleaseIds(releases, storedReleases) { + const storedReleaseIdsBySiteIdAndEntryId = storedReleases.reduce((acc, release) => { + if (!acc[release.site_id]) acc[release.site_id] = {}; + acc[release.site_id][release.entry_id] = release.id; + + return acc; + }, {}); + + const releasesWithId = releases.map(release => ({ + ...release, + id: storedReleaseIdsBySiteIdAndEntryId[release.site.id][release.entryId], + })); + + return releasesWithId; +} + async function extractUniqueReleases(releases) { const duplicateReleaseEntries = await knex('releases') .whereIn(['entry_id', 'site_id'], releases.map(release => [release.entryId, release.site.id])); - const duplicateReleaseEntryKeys = new Set(duplicateReleaseEntries.map(releaseEntry => `${releaseEntry.site_id}_${releaseEntry.entry_id}`)); - const duplicateReleases = releases.filter(release => duplicateReleaseEntryKeys.has(`${release.site.id}_${release.entryId}`)); - const uniqueReleases = releases.filter(release => !duplicateReleaseEntryKeys.has(`${release.site.id}_${release.entryId}`)); + const duplicateReleasesBySiteIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => { + if (!acc[release.site_id]) acc[release.site_id] = {}; + acc[release.site_id][release.entry_id] = true; - return { duplicateReleases, uniqueReleases }; + return acc; + }, {}); + + const duplicateReleases = releases.filter(release => duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]); + const uniqueReleases = releases.filter(release => !duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]); + + return { + uniqueReleases, + duplicateReleases, + duplicateReleaseEntries, + }; } async function storeReleases(releases) { @@ -111,19 +137,19 @@ async function storeReleases(releases) { const releasesWithStudios = await attachStudios(releasesWithSites); // uniqueness is site ID + entry ID, filter uniques after adding sites - const { uniqueReleases, duplicateReleases } = await extractUniqueReleases(releasesWithStudios); + const { uniqueReleases, duplicateReleaseEntries } = await extractUniqueReleases(releasesWithStudios); - console.log(argv.redownload, duplicateReleases); + const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId)); - const curatedReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId)); - const storedReleases = await knex('releases').insert(curatedReleaseEntries).returning('*'); + const storedReleases = await knex('releases').insert(curatedNewReleaseEntries).returning('*'); + // TODO: update duplicate releases - if (Array.isArray(storedReleases)) { - return storedReleases; - } + const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : []; + const releasesWithId = attachReleaseIds(releases, [].concat(storedReleaseEntries, duplicateReleaseEntries)); - // nothing inserted - return []; + await associateTags(releasesWithId); + + return releasesWithId; } module.exports = { diff --git a/src/tags-legacy.js b/src/tags-legacy.js new file mode 100644 index 00000000..c6f97442 --- /dev/null +++ b/src/tags-legacy.js @@ -0,0 +1,110 @@ +'use strict'; + +const logger = require('./logger')(__filename); +const knex = require('./knex'); +const whereOr = require('./utils/where-or'); + +async function curateTag(tag) { + const [aliases, media] = await Promise.all([ + knex('tags').where({ alias_for: tag.id }), + knex('media') + .where('domain', 'tags') + .andWhere('target_id', tag.id) + .orderBy('index'), + ]); + + return { + id: tag.id, + name: tag.name, + slug: tag.slug, + description: tag.description, + poster: media.find(photo => photo.role === 'poster'), + photos: media.filter(photo => photo.role === 'photo'), + group: { + id: tag.group_id, + name: tag.group_name, + description: tag.group_description, + slug: tag.group_slug, + }, + aliases: aliases.map(({ name }) => name), + }; +} + +function curateTags(tags) { + return Promise.all(tags.map(async tag => curateTag(tag))); +} + +async function matchTags(rawTags) { + const filteredTags = rawTags.filter(Boolean); + + const tags = filteredTags + .concat(filteredTags.map(tag => tag.toLowerCase())) + .concat(filteredTags.map(tag => tag.toUpperCase())); + + const tagEntries = await knex('tags') + .pluck('aliases.id') + .whereIn('tags.name', tags) + .leftJoin('tags as aliases', function join() { + this + .on('tags.alias_for', 'aliases.id') + .orOn('tags.id', 'aliases.id'); + }) + .where(function where() { + this + .whereNull('tags.alias_for') + .orWhereNull('aliases.alias_for'); + }) + .groupBy('aliases.id'); + + return tagEntries; +} + +async function associateTags(release, releaseId) { + const siteTags = release.site?.tags?.filter(tag => tag.inherit === true).map(tag => tag.id) || []; + + const rawReleaseTags = release.tags?.filter(Boolean) || []; + const releaseTags = rawReleaseTags.some(tag => typeof tag === 'string') + ? await matchTags(release.tags) // scraper returned raw tags + : rawReleaseTags; // tags already matched by (outdated) scraper + + const tags = Array.from(new Set(releaseTags.concat(siteTags))); + + if (tags.length === 0) { + logger.info(`No tags available for (${release.site.name}, ${releaseId}) "${release.title}"`); + return; + } + + const associationEntries = await knex('releases_tags') + .where('release_id', releaseId) + .whereIn('tag_id', tags); + + const existingAssociations = new Set(associationEntries.map(association => association.tag_id)); + const newAssociations = tags.filter(tagId => !existingAssociations.has(tagId)); + + await knex('releases_tags').insert(newAssociations.map(tagId => ({ + tag_id: tagId, + release_id: releaseId, + }))); +} + +async function fetchTags(queryObject, groupsQueryObject, limit = 100) { + const tags = await knex('tags') + .where(builder => whereOr(queryObject, 'tags', builder)) + .orWhere(builder => whereOr(groupsQueryObject, 'tags_groups', builder)) + .andWhere({ 'tags.alias_for': null }) + .select( + 'tags.*', + 'tags_groups.id as group_id', 'tags_groups.name as group_name', 'tags_groups.slug as group_slug', 'tags_groups.description as groups_description', + ) + .leftJoin('tags_groups', 'tags.group_id', 'tags_groups.id') + .orderBy('name') + .limit(limit); + + return curateTags(tags); +} + +module.exports = { + associateTags, + fetchTags, + matchTags, +}; diff --git a/src/tags.js b/src/tags.js index c6f97442..ad3d1e9d 100644 --- a/src/tags.js +++ b/src/tags.js @@ -1,110 +1,103 @@ 'use strict'; -const logger = require('./logger')(__filename); const knex = require('./knex'); -const whereOr = require('./utils/where-or'); +const slugify = require('./utils/slugify'); -async function curateTag(tag) { - const [aliases, media] = await Promise.all([ - knex('tags').where({ alias_for: tag.id }), - knex('media') - .where('domain', 'tags') - .andWhere('target_id', tag.id) - .orderBy('index'), - ]); +async function matchReleaseTags(releases) { + const rawTags = releases + .map(release => release.tags).flat() + .filter(Boolean); - return { - id: tag.id, - name: tag.name, - slug: tag.slug, - description: tag.description, - poster: media.find(photo => photo.role === 'poster'), - photos: media.filter(photo => photo.role === 'photo'), - group: { - id: tag.group_id, - name: tag.group_name, - description: tag.group_description, - slug: tag.group_slug, - }, - aliases: aliases.map(({ name }) => name), - }; -} - -function curateTags(tags) { - return Promise.all(tags.map(async tag => curateTag(tag))); -} - -async function matchTags(rawTags) { - const filteredTags = rawTags.filter(Boolean); - - const tags = filteredTags - .concat(filteredTags.map(tag => tag.toLowerCase())) - .concat(filteredTags.map(tag => tag.toUpperCase())); + const casedTags = Array.from(new Set( + rawTags + .concat(rawTags.map(tag => tag.toLowerCase())) + .concat(rawTags.map(tag => tag.toUpperCase())), + )); const tagEntries = await knex('tags') - .pluck('aliases.id') - .whereIn('tags.name', tags) - .leftJoin('tags as aliases', function join() { - this - .on('tags.alias_for', 'aliases.id') - .orOn('tags.id', 'aliases.id'); - }) - .where(function where() { - this - .whereNull('tags.alias_for') - .orWhereNull('aliases.alias_for'); - }) - .groupBy('aliases.id'); + .select('tags.id', 'tags.name', 'tags.alias_for') + .whereIn('tags.name', casedTags); - return tagEntries; + const tagIdsBySlug = tagEntries + .reduce((acc, tag) => ({ + ...acc, + [slugify(tag.name)]: tag.alias_for || tag.id, + }), {}); + + return tagIdsBySlug; } -async function associateTags(release, releaseId) { - const siteTags = release.site?.tags?.filter(tag => tag.inherit === true).map(tag => tag.id) || []; +async function getSiteTags(releases) { + const siteIds = releases.map(release => release.site.id); + const siteTags = await knex('sites_tags').whereIn('site_id', siteIds); - const rawReleaseTags = release.tags?.filter(Boolean) || []; - const releaseTags = rawReleaseTags.some(tag => typeof tag === 'string') - ? await matchTags(release.tags) // scraper returned raw tags - : rawReleaseTags; // tags already matched by (outdated) scraper + const siteTagIdsBySiteId = siteTags.reduce((acc, siteTag) => { + if (!acc[siteTag.site_id]) { + acc[siteTag.site_id] = []; + } - const tags = Array.from(new Set(releaseTags.concat(siteTags))); + acc[siteTag.site_id].push(siteTag.tag_id); - if (tags.length === 0) { - logger.info(`No tags available for (${release.site.name}, ${releaseId}) "${release.title}"`); - return; - } + return acc; + }, {}); - const associationEntries = await knex('releases_tags') - .where('release_id', releaseId) - .whereIn('tag_id', tags); - - const existingAssociations = new Set(associationEntries.map(association => association.tag_id)); - const newAssociations = tags.filter(tagId => !existingAssociations.has(tagId)); - - await knex('releases_tags').insert(newAssociations.map(tagId => ({ - tag_id: tagId, - release_id: releaseId, - }))); + return siteTagIdsBySiteId; } -async function fetchTags(queryObject, groupsQueryObject, limit = 100) { - const tags = await knex('tags') - .where(builder => whereOr(queryObject, 'tags', builder)) - .orWhere(builder => whereOr(groupsQueryObject, 'tags_groups', builder)) - .andWhere({ 'tags.alias_for': null }) - .select( - 'tags.*', - 'tags_groups.id as group_id', 'tags_groups.name as group_name', 'tags_groups.slug as group_slug', 'tags_groups.description as groups_description', - ) - .leftJoin('tags_groups', 'tags.group_id', 'tags_groups.id') - .orderBy('name') - .limit(limit); +function buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId) { + const tagAssociations = releases + .map((release) => { + const siteTagIds = siteTagIdsBySiteId[release.site.id]; - return curateTags(tags); + const releaseTagIds = release.tags.every(tag => typeof tag === 'number') + ? release.tags // obsolete scraper returned pre-matched tags + : release.tags.map(tag => tagIdsBySlug[slugify(tag)]); + + return Array.from(new Set( + // filter duplicates and empties + releaseTagIds + .concat(siteTagIds) + .filter(Boolean), + )) + .map(tagId => ({ + release_id: release.id, + tag_id: tagId, + })); + }) + .flat(); + + return tagAssociations; +} + +async function extractUniqueAssociations(tagAssociations) { + const duplicateAssociations = await knex('releases_tags').whereIn(['release_id', 'tag_id'], tagAssociations.map(association => [association.release_id, association.tag_id])); + + const duplicateAssociationsByReleaseIdAndTagId = duplicateAssociations.reduce((acc, association) => { + if (!acc[association.release_id]) { + acc[association.release_id] = {}; + } + + acc[association.release_id][association.tag_id] = true; + + return acc; + }, {}); + + const uniqueAssociations = tagAssociations + .filter(association => !duplicateAssociationsByReleaseIdAndTagId[association.release_id]?.[association.tag_id]); + + return uniqueAssociations; +} + +async function associateTags(releases) { + const tagIdsBySlug = await matchReleaseTags(releases); + const siteTagIdsBySiteId = await getSiteTags(releases); + + const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId); + const uniqueAssociations = extractUniqueAssociations(tagAssociations); + + await knex('releases_tags').insert(uniqueAssociations); } module.exports = { associateTags, - fetchTags, - matchTags, }; diff --git a/src/updates.js b/src/updates.js index 3b027b5b..413d72b1 100644 --- a/src/updates.js +++ b/src/updates.js @@ -34,7 +34,7 @@ async function extractUniqueReleases(latestReleases, accReleases) { // add entry IDs of accumulated releases to prevent an infinite scrape loop // when one page contains the same release as the previous - const duplicateReleaseIdentifiers = duplicateReleases + const duplicateReleasesSiteIdAndEntryIds = duplicateReleases .concat(accReleases) .reduce((acc, release) => { const siteId = release.site_id || release.site.id; @@ -47,7 +47,7 @@ async function extractUniqueReleases(latestReleases, accReleases) { }, {}); const uniqueReleases = latestReleases - .filter(release => !duplicateReleaseIdentifiers[release.site.id]?.[release.entryId]); + .filter(release => !duplicateReleasesSiteIdAndEntryIds[release.site.id]?.[release.entryId]); return uniqueReleases; } diff --git a/src/utils/http.js b/src/utils/http.js index fb23637b..f83da774 100644 --- a/src/utils/http.js +++ b/src/utils/http.js @@ -52,11 +52,11 @@ queue.define('http', async ({ const reqOptions = { headers: { + ...(options.defaultHeaders !== false && defaultHeaders), ...headers, - ...defaultHeaders, }, - ...options, ...defaultOptions, + ...options, ...(options.timeout && { responseTimeout: options.timeout }), }; diff --git a/src/utils/qu.js b/src/utils/qu.js index 42691b02..a2991c86 100644 --- a/src/utils/qu.js +++ b/src/utils/qu.js @@ -288,9 +288,7 @@ function extractAll(htmlValue, selector) { } async function get(urlValue, selector, headers, options, queryAll = false) { - const res = await http.get(urlValue, { - headers, - }); + const res = await http.get(urlValue, headers); if (res.statusCode === 200) { const item = queryAll diff --git a/src/utils/slugify.js b/src/utils/slugify.js index f14371a3..97f02870 100644 --- a/src/utils/slugify.js +++ b/src/utils/slugify.js @@ -4,6 +4,10 @@ function slugify(string, delimiter = '-', { encode = false, limit = 1000, } = {}) { + if (!string) { + return ''; + } + const slugComponents = string.trim().toLowerCase().match(/\w+/g); if (!slugComponents) {