Improved release storage module. Added new tags module. Added movie scraping.

This commit is contained in:
ThePendulum 2020-03-22 03:50:24 +01:00
parent d765543b30
commit fdb2b132f6
24 changed files with 343 additions and 141 deletions

View File

@@ -352,7 +352,7 @@ exports.up = knex => Promise.resolve()
       table.string('shoot_id');
       table.string('entry_id');
-      table.unique(['site_id', 'entry_id']);
+      table.unique(['site_id', 'entry_id', 'type']);
       table.string('url', 1000);
       table.string('title');
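The widened unique key means a scene and a movie from the same site can now share an entry ID as long as their type differs. A minimal sketch of the intent (table and column names from this migration; the 'scene'/'movie' values are assumptions based on this commit's movie support):

// hypothetical: both rows can coexist under the new (site_id, entry_id, type) key
await knex('releases').insert({ site_id: 1, entry_id: '12345', type: 'scene' });
await knex('releases').insert({ site_id: 1, entry_id: '12345', type: 'movie' });
// under the old (site_id, entry_id) key the second insert would have violated the constraint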

Binary file not shown (before: 37 KiB, after: 40 KiB)

Binary file not shown (before: 25 KiB, after: 27 KiB)

Binary file not shown (new file, 27 KiB)

Binary file not shown (new file, 44 KiB)

View File

@@ -0,0 +1,58 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Generator: Adobe Illustrator 23.1.0, SVG Export Plug-In . SVG Version: 6.00 Build 0) -->
<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
viewBox="0 0 180 42" style="enable-background:new 0 0 180 42;" xml:space="preserve">
<style type="text/css">
.st0{fill:url(#SVGID_1_);}
.st1{fill:url(#SVGID_2_);}
.st2{fill:url(#SVGID_3_);}
.st3{fill:url(#SVGID_4_);}
.st4{fill:url(#SVGID_5_);}
.st5{fill:#FFFFFF;}
</style>
<g>
<linearGradient id="SVGID_1_" gradientUnits="userSpaceOnUse" x1="92.25" y1="10.6716" x2="92.25" y2="35.0304" gradientTransform="matrix(1 0 0 -1 0 44)">
<stop offset="0" style="stop-color:#E93F3A"/>
<stop offset="1" style="stop-color:#F48B40"/>
</linearGradient>
<polygon class="st0" points="98.9,9.5 97,20.4 94.7,9.5 89.8,9.5 87.6,20.4 85.6,9.5 78.9,9.5 84.3,32.8 90.3,32.8 92.2,22.7
94.2,32.8 100.2,32.8 105.6,9.5 "/>
<linearGradient id="SVGID_2_" gradientUnits="userSpaceOnUse" x1="115" y1="10.6716" x2="115" y2="35.0304" gradientTransform="matrix(1 0 0 -1 0 44)">
<stop offset="0" style="stop-color:#E93F3A"/>
<stop offset="1" style="stop-color:#F48B40"/>
</linearGradient>
<path class="st1" d="M115,9.2c-5.9,0-9.9,4.8-9.9,12s4,12,9.9,12s9.9-4.8,9.9-12S121,9.2,115,9.2z M115,27.5
c-2.6,0-3.5-3.4-3.5-6.3s0.9-6.3,3.5-6.3c2.7,0,3.6,3.4,3.6,6.3S117.7,27.5,115,27.5z"/>
<linearGradient id="SVGID_3_" gradientUnits="userSpaceOnUse" x1="135.55" y1="10.6716" x2="135.55" y2="35.0304" gradientTransform="matrix(1 0 0 -1 0 44)">
<stop offset="0" style="stop-color:#E93F3A"/>
<stop offset="1" style="stop-color:#F48B40"/>
</linearGradient>
<path class="st2" d="M140,23.9c1.8-1,3.4-3.1,3.4-6.6c0-4.5-3.1-7.7-7.5-7.7H127v23.3h6.2v-7.8h0.7l3.3,7.8h6.9L140,23.9z
M135.3,19.4h-2.1v-4.2h2.1c1.6,0,1.8,1.5,1.8,2.1C137.1,18.3,136.6,19.4,135.3,19.4z"/>
<linearGradient id="SVGID_4_" gradientUnits="userSpaceOnUse" x1="152.3" y1="10.6716" x2="152.3" y2="35.0304" gradientTransform="matrix(1 0 0 -1 0 44)">
<stop offset="0" style="stop-color:#E93F3A"/>
<stop offset="1" style="stop-color:#F48B40"/>
</linearGradient>
<polygon class="st3" points="151.8,27.1 151.8,9.5 145.6,9.5 145.6,32.8 159,32.8 159,27.1 "/>
<linearGradient id="SVGID_5_" gradientUnits="userSpaceOnUse" x1="169.8" y1="10.6716" x2="169.8" y2="35.0304" gradientTransform="matrix(1 0 0 -1 0 44)">
<stop offset="0" style="stop-color:#E93F3A"/>
<stop offset="1" style="stop-color:#F48B40"/>
</linearGradient>
<path class="st4" d="M168.6,9.5h-7.9v23.3h7.9c6.2,0,10.3-4.6,10.3-11.6C178.9,14.1,174.9,9.5,168.6,9.5z M167,15.2h1.6
c3.4,0,4,3.7,4,6c0,1.4-0.3,5.9-4,5.9H167V15.2z"/>
</g>
<g>
<path class="st5" d="M9.3,9.5H0.5v23.3h6.2V25h2.6c5.2,0,7.6-3.9,7.6-7.8C16.9,13.4,14.6,9.5,9.3,9.5z M6.7,15.2h2.1
c1.1,0,1.8,0.8,1.8,2.1c0,1-0.5,2.1-1.8,2.1H6.7V15.2z"/>
<path class="st5" d="M27.6,9.2c-5.9,0-9.9,4.8-9.9,12s4,12,9.9,12s9.9-4.8,9.9-12S33.5,9.2,27.6,9.2z M27.6,27.5
c-2.6,0-3.5-3.4-3.5-6.3s0.9-6.3,3.5-6.3c2.7,0,3.6,3.4,3.6,6.3S30.2,27.5,27.6,27.5z"/>
<path class="st5" d="M52.5,23.9c1.8-1,3.4-3.1,3.4-6.6c0-4.5-3.1-7.7-7.5-7.7h-8.9v23.3h6.2v-7.8h0.7l3.3,7.8h6.9L52.5,23.9z
M47.8,19.4h-2.1v-4.2h2.1c1.6,0,1.8,1.5,1.8,2.1C49.6,18.3,49.1,19.4,47.8,19.4z"/>
<polygon class="st5" points="70.4,9.5 70.4,19.8 64.9,9.5 59,9.5 59,32.8 65.2,32.8 65.2,22 71,32.8 76.7,32.8 76.7,9.5 "/>
</g>
</svg>

Rendered preview not shown (new file, 3.4 KiB)

public/img/tags/anal/2.jpeg: new binary file, not shown (1.7 MiB)

Binary file not shown (new file, 119 KiB)

Binary file not shown (new file, 1.9 MiB)

Binary file not shown (new file, 123 KiB)

Binary file not shown (new file, 252 KiB)

Binary file not shown (new file, 101 KiB)

View File

@@ -1,40 +1,40 @@
const upsert = require('../src/utils/upsert');
const tagPosters = [
['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'],
['anal', 2, 'Sheena Shaw for Bang Bros'],
['anal-creampie', 0, 'Gina Valentina and Jane Wilde in "A Very Special Anniversary" for Tushy'],
['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'],
['ass-to-mouth', 'poster', 'Alysa Gap and Logan in "Anal Buffet 4" for Evil Angel'],
['bdsm', 0, 'Dani Daniels in "The Training of Dani Daniels, Day 2" for The Training of O at Kink'],
['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'],
['blowbang', 'poster'],
['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
['bukkake', 'poster'],
['caucasian', 1, 'Sheena Shaw for Brazzers'],
['creampie', 'poster'],
['da-tp', 0, 'Natasha Teen in LegalPorno SZ2164'],
['deepthroat', 0, 'Chanel Grey in "Deepthroating Is Fun" for Throated'],
['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'],
['double-anal', 2, 'Lana Rhoades in "Lana Rhoades Unleashed" for HardX'],
['double-penetration', 'poster', 'Mia Malkova in "DP!" for HardX'],
['double-vaginal', 'poster', 'Riley Reid in "Pizza That Ass" for Reid My Lips'],
['dv-tp', 'poster', 'Juelz Ventura in "Gangbanged 5" for Elegant Angel'],
['oral-creampie', 1, 'Keisha Grey in Brazzers House'],
['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'],
['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'],
['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'],
['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
['blowbang', 'poster'],
['bukkake', 'poster'],
['caucasian', 'poster'],
['creampie', 'poster'],
['ebony', 1, 'Sarah Banks for Brazzers'],
['facial', 'poster'],
['facefucking', '1', 'Carrie for Young Throats'],
['facial', 'poster'],
['gangbang', 'poster', 'Kristen Scott in "Interracial Gangbang!" for Jules Jordan'],
['gaping', 1, 'Vina Sky in "Vina Sky Does Anal" for HardX'],
['interracial', 'poster'],
['latina', 'poster'],
['mff', 0, 'Madison Ivy and Adriana Chechik in "Day With A Pornstar" for Brazzers'],
['mfm', 'poster'],
['oral-creampie', 1, 'Keisha Grey in Brazzers House'],
['orgy', 'poster'],
['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
['swallowing', 'poster'],
['tattoo', 'poster', 'Kali Roses in "Goes All In For Anal" for Hussie Pass'],
['trainbang', 'poster', 'Kali Roses in "Passing Me Around" for Blacked'],
['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'],
]
.map(([slug, filename, comment], index) => ({
tagSlug: slug,
@@ -49,12 +49,15 @@ const tagPhotos = [
['airtight', 2, 'Dakota Skye in "Dakota Goes Nuts" for ArchAngel'],
['airtight', 3, 'Anita Bellini in "Triple Dick Gangbang" for Hands On Hardcore (DDF Network)'],
['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'],
['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
['anal', 0],
['caucasian', 'poster'],
['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'],
['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'],
['da-tp', 3, 'Evelina Darling in GIO294'],
['da-tp', 4, 'Ninel Mojado aka Mira Cuckold in GIO063 for LegalPorno'],
['double-anal', 2, 'Lana Rhoades in "Gangbang Me 3" for HardX'],
['double-anal', 6, 'Sheena Shaw in "Ass Worship 14" for Jules Jordan'],
['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'],
['double-anal', 'poster', 'Haley Reed in "Young Hot Ass" for Evil Angel'],
['double-anal', 0, 'Nicole Black doing double anal during a gangbang in GIO971 for LegalPorno'],
['double-anal', 1, 'Ria Sunn in SZ1801 for LegalPorno'],

View File

@@ -24,18 +24,19 @@ async function init() {
   }
   const updateBaseScenes = (argv.scrape || argv.sites || argv.networks) && await fetchUpdates();
-  const deepScenes = await fetchScenes([...(argv.scenes || []), ...(updateBaseScenes || [])]);
+  const deepScenes = argv.deep && await fetchScenes([...(argv.scenes || []), ...(updateBaseScenes || [])]);
+  console.log(deepScenes.map(scene => scene.movie));
   const sceneMovies = deepScenes && argv.sceneMovies && deepScenes.map(scene => scene.movie).filter(Boolean);
-  const deepMovies = await fetchMovies([...(argv.movies || []), ...(sceneMovies || [])]);
+  const argvDeepMovies = argv.movies && await fetchMovies(argv.movies);
   if (argv.save) {
-    await storeReleases([
-      ...(deepScenes || []),
-      ...(deepMovies || []),
-    ]);
+    await storeReleases([
+      ...(deepScenes || []),
+      ...(argvDeepMovies || []),
+    ]);
     // await storeReleaseActors(updateReleases);
   }
   knex.destroy();
 }
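With deep scene fetching now gated behind its own flag and movie URLs fetched from a separate argument, init() reduces to the outline below (names from this diff; the --deep flag spelling is an assumption based on argv.deep):

// 1. fetchUpdates()                -> base releases, only with --scrape/--sites/--networks
// 2. fetchScenes(scenes, updates)  -> deep scene data, only when argv.deep is set
// 3. fetchMovies(argv.movies)      -> deep movie data for URLs passed on the command line
// 4. storeReleases([...scenes, ...movies]) -> one storage path for both, when argv.save is set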

View File

@@ -29,21 +29,27 @@ const { argv } = yargs
     type: 'array',
     alias: 'actor',
   })
-  .option('with-scenes', {
-    describe: 'Fetch all scenes for an actor or movie',
+  .option('actor-scenes', {
+    describe: 'Fetch all scenes for an actor',
     type: 'boolean',
     alias: 'with-releases',
     default: false,
   })
-  .option('with-movies', {
+  .option('movie-scenes', {
+    describe: 'Fetch all scenes for a movie',
     type: 'boolean',
     alias: 'with-releases',
     default: false,
   })
+  .option('scene-movies', {
+    describe: 'Fetch movies for scenes',
+    type: 'boolean',
+    default: true,
+  })
-  .option('with-profiles', {
+  .option('profiles', {
     describe: 'Scrape profiles for new actors after fetching scenes',
     type: 'boolean',
-    alias: 'with-actors',
+    alias: 'bios',
     default: false,
   })
   .option('scene', {
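yargs camel-cases hyphenated option names, which is how these flags surface in src/app.js as argv.sceneMovies and friends. A quick mapping (aliases as shown in this diff):

// --actor-scenes -> argv.actorScenes (alias: --with-releases)
// --movie-scenes -> argv.movieScenes (alias: --with-releases)
// --scene-movies -> argv.sceneMovies (boolean, default true; negatable as --no-scene-movies)
// --profiles     -> argv.profiles   (alias: --bios)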

View File

@@ -99,7 +99,7 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
     };
   }
-  const scraper = scrapers.releases[site.slug];
+  const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
   if (!scraper) {
     logger.warn(`Could not find scraper for ${baseRelease.url}`);
@@ -124,6 +124,7 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
   };
   if (scrapedRelease && baseRelease?.tags) {
+    // accumulate all available tags
     mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags);
   }
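The release scraper lookup now falls back from the site slug to the parent network slug, so a channel without a dedicated scraper reuses its network's. A minimal sketch of the lookup order (the helper name findReleaseScraper is hypothetical; the object shapes follow this diff):

const scrapers = { releases: { /* scrapers keyed by site and network slug */ } };

function findReleaseScraper(site) {
  // prefer a scraper registered for the specific site,
  // then fall back to one registered for its parent network
  return scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
}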

View File

@@ -141,7 +141,9 @@ async function fetchLatest(site, page = 1) {
 }
 async function fetchScene(url, site) {
-  const res = await bhttp.get(`https://ddfnetwork.com${new URL(url).pathname}`);
+  // DDF's main site moved to Porn World
+  // const res = await bhttp.get(`https://ddfnetwork.com${new URL(url).pathname}`);
+  const res = await bhttp.get(url);
   return scrapeScene(res.body.toString(), url, site);
 }
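Scene URLs are now requested as stored instead of being rewritten onto the ddfnetwork.com host, which stopped resolving scenes after the move to Porn World. Illustration with hypothetical URLs:

// before: https://pornworld.com/scene/123 was rewritten to https://ddfnetwork.com/scene/123
// after:  https://pornworld.com/scene/123 is fetched directly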

View File

@@ -2,10 +2,10 @@
 const config = require('config');
 const argv = require('./argv');
 const logger = require('./logger')(__filename);
 const knex = require('./knex');
 const slugify = require('./utils/slugify');
+const { associateTags } = require('./tags');
function curateReleaseEntry(release, batchId, existingRelease) {
const slug = slugify(release.title, '-', {
@@ -34,7 +34,7 @@ function curateReleaseEntry(release, batchId, existingRelease) {
     updated_batch_id: batchId,
   };
-  if (!existingRelease) {
+  if (!existingRelease && !release.id) {
     curatedRelease.created_batch_id = batchId;
   }
@@ -60,7 +60,7 @@ async function attachChannelSites(releases) {
     };
   }
-  logger.error(`Unable to match channel '${release.channel.slug || release.channel}' from generic URL ${release.url}`);
+  logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);
   return null;
 })
@@ -93,15 +93,41 @@ async function attachStudios(releases) {
   return releasesWithStudio;
 }
+function attachReleaseIds(releases, storedReleases) {
+  const storedReleaseIdsBySiteIdAndEntryId = storedReleases.reduce((acc, release) => {
+    if (!acc[release.site_id]) acc[release.site_id] = {};
+    acc[release.site_id][release.entry_id] = release.id;
+    return acc;
+  }, {});
+  const releasesWithId = releases.map(release => ({
+    ...release,
+    id: storedReleaseIdsBySiteIdAndEntryId[release.site.id][release.entryId],
+  }));
+  return releasesWithId;
+}
 async function extractUniqueReleases(releases) {
   const duplicateReleaseEntries = await knex('releases')
     .whereIn(['entry_id', 'site_id'], releases.map(release => [release.entryId, release.site.id]));
-  const duplicateReleaseEntryKeys = new Set(duplicateReleaseEntries.map(releaseEntry => `${releaseEntry.site_id}_${releaseEntry.entry_id}`));
-  const duplicateReleases = releases.filter(release => duplicateReleaseEntryKeys.has(`${release.site.id}_${release.entryId}`));
-  const uniqueReleases = releases.filter(release => !duplicateReleaseEntryKeys.has(`${release.site.id}_${release.entryId}`));
-  return { duplicateReleases, uniqueReleases };
+  const duplicateReleasesBySiteIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
+    if (!acc[release.site_id]) acc[release.site_id] = {};
+    acc[release.site_id][release.entry_id] = true;
+    return acc;
+  }, {});
+  const duplicateReleases = releases.filter(release => duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]);
+  const uniqueReleases = releases.filter(release => !duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]);
+  return {
+    uniqueReleases,
+    duplicateReleases,
+    duplicateReleaseEntries,
+  };
 }
 async function storeReleases(releases) {
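Both new helpers index rows by site ID and entry ID through a nested plain object, replacing the earlier string-keyed Set. The shape, with invented values:

// duplicateReleasesBySiteIdAndEntryId (and the stored-ID variant) look like:
// {
//   14: { '12345': true, '12346': true },  // site_id 14
//   27: { 'gio971': true },                // site_id 27
// }
// membership is then a null-safe O(1) check:
// duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]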
@@ -111,19 +137,19 @@ async function storeReleases(releases) {
   const releasesWithStudios = await attachStudios(releasesWithSites);
   // uniqueness is site ID + entry ID, filter uniques after adding sites
-  const { uniqueReleases, duplicateReleases } = await extractUniqueReleases(releasesWithStudios);
-  console.log(argv.redownload, duplicateReleases);
+  const { uniqueReleases, duplicateReleaseEntries } = await extractUniqueReleases(releasesWithStudios);
-  const curatedReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
+  const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
-  const storedReleases = await knex('releases').insert(curatedReleaseEntries).returning('*');
+  const storedReleases = await knex('releases').insert(curatedNewReleaseEntries).returning('*');
   // TODO: update duplicate releases
-  if (Array.isArray(storedReleases)) {
-    return storedReleases;
-  }
-  // nothing inserted
-  return [];
+  const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];
+  const releasesWithId = attachReleaseIds(releases, [].concat(storedReleaseEntries, duplicateReleaseEntries));
+  await associateTags(releasesWithId);
+  return releasesWithId;
 }
module.exports = {
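Taken together, storeReleases now returns every input release with a database ID attached, whether the row was freshly inserted or already stored, and tags are associated in the same pass. In outline (names from this diff):

// storeReleases(releases):
// 1. attachChannelSites / attachStudios              -> resolve sites and studios
// 2. extractUniqueReleases                           -> split new rows from stored duplicates
// 3. insert new rows with returning('*')             -> stored entries with fresh IDs
// 4. attachReleaseIds(releases, stored + duplicates) -> an ID for every release
// 5. associateTags(releasesWithId)                   -> batch tag association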

src/tags-legacy.js Normal file (110 lines)
View File

@@ -0,0 +1,110 @@
'use strict';
const logger = require('./logger')(__filename);
const knex = require('./knex');
const whereOr = require('./utils/where-or');
async function curateTag(tag) {
const [aliases, media] = await Promise.all([
knex('tags').where({ alias_for: tag.id }),
knex('media')
.where('domain', 'tags')
.andWhere('target_id', tag.id)
.orderBy('index'),
]);
return {
id: tag.id,
name: tag.name,
slug: tag.slug,
description: tag.description,
poster: media.find(photo => photo.role === 'poster'),
photos: media.filter(photo => photo.role === 'photo'),
group: {
id: tag.group_id,
name: tag.group_name,
description: tag.group_description,
slug: tag.group_slug,
},
aliases: aliases.map(({ name }) => name),
};
}
function curateTags(tags) {
return Promise.all(tags.map(async tag => curateTag(tag)));
}
async function matchTags(rawTags) {
const filteredTags = rawTags.filter(Boolean);
const tags = filteredTags
.concat(filteredTags.map(tag => tag.toLowerCase()))
.concat(filteredTags.map(tag => tag.toUpperCase()));
const tagEntries = await knex('tags')
.pluck('aliases.id')
.whereIn('tags.name', tags)
.leftJoin('tags as aliases', function join() {
this
.on('tags.alias_for', 'aliases.id')
.orOn('tags.id', 'aliases.id');
})
.where(function where() {
this
.whereNull('tags.alias_for')
.orWhereNull('aliases.alias_for');
})
.groupBy('aliases.id');
return tagEntries;
}
async function associateTags(release, releaseId) {
const siteTags = release.site?.tags?.filter(tag => tag.inherit === true).map(tag => tag.id) || [];
const rawReleaseTags = release.tags?.filter(Boolean) || [];
const releaseTags = rawReleaseTags.some(tag => typeof tag === 'string')
? await matchTags(release.tags) // scraper returned raw tags
: rawReleaseTags; // tags already matched by (outdated) scraper
const tags = Array.from(new Set(releaseTags.concat(siteTags)));
if (tags.length === 0) {
logger.info(`No tags available for (${release.site.name}, ${releaseId}) "${release.title}"`);
return;
}
const associationEntries = await knex('releases_tags')
.where('release_id', releaseId)
.whereIn('tag_id', tags);
const existingAssociations = new Set(associationEntries.map(association => association.tag_id));
const newAssociations = tags.filter(tagId => !existingAssociations.has(tagId));
await knex('releases_tags').insert(newAssociations.map(tagId => ({
tag_id: tagId,
release_id: releaseId,
})));
}
async function fetchTags(queryObject, groupsQueryObject, limit = 100) {
const tags = await knex('tags')
.where(builder => whereOr(queryObject, 'tags', builder))
.orWhere(builder => whereOr(groupsQueryObject, 'tags_groups', builder))
.andWhere({ 'tags.alias_for': null })
.select(
'tags.*',
'tags_groups.id as group_id', 'tags_groups.name as group_name', 'tags_groups.slug as group_slug', 'tags_groups.description as group_description',
)
.leftJoin('tags_groups', 'tags.group_id', 'tags_groups.id')
.orderBy('name')
.limit(limit);
return curateTags(tags);
}
module.exports = {
associateTags,
fetchTags,
matchTags,
};
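matchTags compensates for case differences by querying lower- and upper-cased variants of each raw tag alongside the original. A worked illustration (hypothetical input):

// rawTags from a scraper: ['Anal', 'DP']
// candidate names queried against tags.name:
// ['Anal', 'DP', 'anal', 'dp', 'ANAL', 'DP']
// so 'Anal' still matches a row named 'anal' without a case-insensitive query;
// a mixed-case name like 'AnAl' that matches none of the three variants is still missed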

View File

@@ -1,110 +1,103 @@
 'use strict';
 const logger = require('./logger')(__filename);
 const knex = require('./knex');
-const whereOr = require('./utils/where-or');
+const slugify = require('./utils/slugify');
-async function curateTag(tag) {
-  const [aliases, media] = await Promise.all([
-    knex('tags').where({ alias_for: tag.id }),
-    knex('media')
-      .where('domain', 'tags')
-      .andWhere('target_id', tag.id)
-      .orderBy('index'),
-  ]);
-  return {
-    id: tag.id,
-    name: tag.name,
-    slug: tag.slug,
-    description: tag.description,
-    poster: media.find(photo => photo.role === 'poster'),
-    photos: media.filter(photo => photo.role === 'photo'),
-    group: {
-      id: tag.group_id,
-      name: tag.group_name,
-      description: tag.group_description,
-      slug: tag.group_slug,
-    },
-    aliases: aliases.map(({ name }) => name),
-  };
-}
-function curateTags(tags) {
-  return Promise.all(tags.map(async tag => curateTag(tag)));
-}
-async function matchTags(rawTags) {
-  const filteredTags = rawTags.filter(Boolean);
-  const tags = filteredTags
-    .concat(filteredTags.map(tag => tag.toLowerCase()))
-    .concat(filteredTags.map(tag => tag.toUpperCase()));
-  const tagEntries = await knex('tags')
-    .pluck('aliases.id')
-    .whereIn('tags.name', tags)
-    .leftJoin('tags as aliases', function join() {
-      this
-        .on('tags.alias_for', 'aliases.id')
-        .orOn('tags.id', 'aliases.id');
-    })
-    .where(function where() {
-      this
-        .whereNull('tags.alias_for')
-        .orWhereNull('aliases.alias_for');
-    })
-    .groupBy('aliases.id');
-  return tagEntries;
-}
-async function associateTags(release, releaseId) {
-  const siteTags = release.site?.tags?.filter(tag => tag.inherit === true).map(tag => tag.id) || [];
-  const rawReleaseTags = release.tags?.filter(Boolean) || [];
-  const releaseTags = rawReleaseTags.some(tag => typeof tag === 'string')
-    ? await matchTags(release.tags) // scraper returned raw tags
-    : rawReleaseTags; // tags already matched by (outdated) scraper
-  const tags = Array.from(new Set(releaseTags.concat(siteTags)));
-  if (tags.length === 0) {
-    logger.info(`No tags available for (${release.site.name}, ${releaseId}) "${release.title}"`);
-    return;
-  }
-  const associationEntries = await knex('releases_tags')
-    .where('release_id', releaseId)
-    .whereIn('tag_id', tags);
-  const existingAssociations = new Set(associationEntries.map(association => association.tag_id));
-  const newAssociations = tags.filter(tagId => !existingAssociations.has(tagId));
-  await knex('releases_tags').insert(newAssociations.map(tagId => ({
-    tag_id: tagId,
-    release_id: releaseId,
-  })));
-}
-async function fetchTags(queryObject, groupsQueryObject, limit = 100) {
-  const tags = await knex('tags')
-    .where(builder => whereOr(queryObject, 'tags', builder))
-    .orWhere(builder => whereOr(groupsQueryObject, 'tags_groups', builder))
-    .andWhere({ 'tags.alias_for': null })
-    .select(
-      'tags.*',
-      'tags_groups.id as group_id', 'tags_groups.name as group_name', 'tags_groups.slug as group_slug', 'tags_groups.description as groups_description',
-    )
-    .leftJoin('tags_groups', 'tags.group_id', 'tags_groups.id')
-    .orderBy('name')
-    .limit(limit);
-  return curateTags(tags);
-}
+async function matchReleaseTags(releases) {
+  const rawTags = releases
+    .map(release => release.tags).flat()
+    .filter(Boolean);
+  const casedTags = Array.from(new Set(
+    rawTags
+      .concat(rawTags.map(tag => tag.toLowerCase()))
+      .concat(rawTags.map(tag => tag.toUpperCase())),
+  ));
+  const tagEntries = await knex('tags')
+    .select('tags.id', 'tags.name', 'tags.alias_for')
+    .whereIn('tags.name', casedTags);
+  const tagIdsBySlug = tagEntries
+    .reduce((acc, tag) => ({
+      ...acc,
+      [slugify(tag.name)]: tag.alias_for || tag.id,
+    }), {});
+  return tagIdsBySlug;
+}
+async function getSiteTags(releases) {
+  const siteIds = releases.map(release => release.site.id);
+  const siteTags = await knex('sites_tags').whereIn('site_id', siteIds);
+  const siteTagIdsBySiteId = siteTags.reduce((acc, siteTag) => {
+    if (!acc[siteTag.site_id]) {
+      acc[siteTag.site_id] = [];
+    }
+    acc[siteTag.site_id].push(siteTag.tag_id);
+    return acc;
+  }, {});
+  return siteTagIdsBySiteId;
+}
+function buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId) {
+  const tagAssociations = releases
+    .map((release) => {
+      const siteTagIds = siteTagIdsBySiteId[release.site.id];
+      const releaseTagIds = release.tags.every(tag => typeof tag === 'number')
+        ? release.tags // obsolete scraper returned pre-matched tags
+        : release.tags.map(tag => tagIdsBySlug[slugify(tag)]);
+      return Array.from(new Set(
+        // filter duplicates and empties
+        releaseTagIds
+          .concat(siteTagIds)
+          .filter(Boolean),
+      ))
+        .map(tagId => ({
+          release_id: release.id,
+          tag_id: tagId,
+        }));
+    })
+    .flat();
+  return tagAssociations;
+}
+async function extractUniqueAssociations(tagAssociations) {
+  const duplicateAssociations = await knex('releases_tags').whereIn(['release_id', 'tag_id'], tagAssociations.map(association => [association.release_id, association.tag_id]));
+  const duplicateAssociationsByReleaseIdAndTagId = duplicateAssociations.reduce((acc, association) => {
+    if (!acc[association.release_id]) {
+      acc[association.release_id] = {};
+    }
+    acc[association.release_id][association.tag_id] = true;
+    return acc;
+  }, {});
+  const uniqueAssociations = tagAssociations
+    .filter(association => !duplicateAssociationsByReleaseIdAndTagId[association.release_id]?.[association.tag_id]);
+  return uniqueAssociations;
+}
+async function associateTags(releases) {
+  const tagIdsBySlug = await matchReleaseTags(releases);
+  const siteTagIdsBySiteId = await getSiteTags(releases);
+  const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId);
+  const uniqueAssociations = await extractUniqueAssociations(tagAssociations);
+  await knex('releases_tags').insert(uniqueAssociations);
+}
 module.exports = {
   associateTags,
   fetchTags,
   matchTags,
 };
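The new tags module associates tags for a whole batch of stored releases with a fixed number of queries instead of several per release. A hedged usage sketch (release shapes follow this diff; the values are invented):

// call site, after releases have their database IDs attached:
await associateTags([
  { id: 101, site: { id: 14 }, tags: ['Anal', 'Gaping'] }, // raw scraper tags, matched by slug
  { id: 102, site: { id: 14 }, tags: [23, 57] },           // pre-matched IDs from an obsolete scraper
]);
// tag names, inherited site tags and existing associations are each fetched once
// for the whole batch before a single de-duplicated insert into releases_tags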

View File

@@ -34,7 +34,7 @@ async function extractUniqueReleases(latestReleases, accReleases) {
   // add entry IDs of accumulated releases to prevent an infinite scrape loop
   // when one page contains the same release as the previous
-  const duplicateReleaseIdentifiers = duplicateReleases
+  const duplicateReleasesSiteIdAndEntryIds = duplicateReleases
     .concat(accReleases)
     .reduce((acc, release) => {
       const siteId = release.site_id || release.site.id;
@@ -47,7 +47,7 @@ async function extractUniqueReleases(latestReleases, accReleases) {
     }, {});
   const uniqueReleases = latestReleases
-    .filter(release => !duplicateReleaseIdentifiers[release.site.id]?.[release.entryId]);
+    .filter(release => !duplicateReleasesSiteIdAndEntryIds[release.site.id]?.[release.entryId]);
   return uniqueReleases;
 }

View File

@@ -52,11 +52,11 @@ queue.define('http', async ({
   const reqOptions = {
     headers: {
       ...(options.defaultHeaders !== false && defaultHeaders),
-      ...headers,
+      ...defaultHeaders,
     },
-    ...options,
+    ...defaultOptions,
+    ...options,
     ...(options.timeout && { responseTimeout: options.timeout }),
   };
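The merge order matters because later object spreads win: putting ...options after ...defaultOptions lets callers override the defaults rather than the other way around. A minimal illustration (property names invented apart from responseTimeout):

const defaultOptions = { responseTimeout: 30000, followRedirects: true };
const options = { responseTimeout: 5000 };

// later spreads win: the caller's timeout overrides the default
const reqOptions = { ...defaultOptions, ...options };
// -> { responseTimeout: 5000, followRedirects: true }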

View File

@@ -288,9 +288,7 @@ function extractAll(htmlValue, selector) {
 }
 async function get(urlValue, selector, headers, options, queryAll = false) {
-  const res = await http.get(urlValue, {
-    headers,
-  });
+  const res = await http.get(urlValue, headers);
   if (res.statusCode === 200) {
     const item = queryAll
View File

@@ -4,6 +4,10 @@ function slugify(string, delimiter = '-', {
   encode = false,
   limit = 1000,
 } = {}) {
+  if (!string) {
+    return '';
+  }
   const slugComponents = string.trim().toLowerCase().match(/\w+/g);
   if (!slugComponents) {
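The early return makes slugify safe for missing input, which the new tags module relies on when it slugifies tags that may be absent. Based on the visible implementation:

slugify(null);           // -> '' instead of throwing on null.trim()
slugify('Ass To Mouth'); // -> 'ass-to-mouth' (assuming the default '-' delimiter)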