Improved release storage module. Added new tags module. Added movie scraping.
|
@ -352,7 +352,7 @@ exports.up = knex => Promise.resolve()
|
|||
|
||||
table.string('shoot_id');
|
||||
table.string('entry_id');
|
||||
table.unique(['site_id', 'entry_id']);
|
||||
table.unique(['site_id', 'entry_id', 'type']);
|
||||
|
||||
table.string('url', 1000);
|
||||
table.string('title');
|
||||
|
|
Before Width: | Height: | Size: 37 KiB After Width: | Height: | Size: 40 KiB |
Before Width: | Height: | Size: 25 KiB After Width: | Height: | Size: 27 KiB |
After Width: | Height: | Size: 27 KiB |
After Width: | Height: | Size: 44 KiB |
|
@ -0,0 +1,58 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<!-- Generator: Adobe Illustrator 23.1.0, SVG Export Plug-In . SVG Version: 6.00 Build 0) -->
|
||||
<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
|
||||
viewBox="0 0 180 42" style="enable-background:new 0 0 180 42;" xml:space="preserve">
|
||||
<style type="text/css">
|
||||
.st0{fill:url(#SVGID_1_);}
|
||||
.st1{fill:url(#SVGID_2_);}
|
||||
.st2{fill:url(#SVGID_3_);}
|
||||
.st3{fill:url(#SVGID_4_);}
|
||||
.st4{fill:url(#SVGID_5_);}
|
||||
.st5{fill:#FFFFFF;}
|
||||
</style>
|
||||
<g>
|
||||
|
||||
<linearGradient id="SVGID_1_" gradientUnits="userSpaceOnUse" x1="92.25" y1="10.6716" x2="92.25" y2="35.0304" gradientTransform="matrix(1 0 0 -1 0 44)">
|
||||
<stop offset="0" style="stop-color:#E93F3A"/>
|
||||
<stop offset="1" style="stop-color:#F48B40"/>
|
||||
</linearGradient>
|
||||
<polygon class="st0" points="98.9,9.5 97,20.4 94.7,9.5 89.8,9.5 87.6,20.4 85.6,9.5 78.9,9.5 84.3,32.8 90.3,32.8 92.2,22.7
|
||||
94.2,32.8 100.2,32.8 105.6,9.5 "/>
|
||||
|
||||
<linearGradient id="SVGID_2_" gradientUnits="userSpaceOnUse" x1="115" y1="10.6716" x2="115" y2="35.0304" gradientTransform="matrix(1 0 0 -1 0 44)">
|
||||
<stop offset="0" style="stop-color:#E93F3A"/>
|
||||
<stop offset="1" style="stop-color:#F48B40"/>
|
||||
</linearGradient>
|
||||
<path class="st1" d="M115,9.2c-5.9,0-9.9,4.8-9.9,12s4,12,9.9,12s9.9-4.8,9.9-12S121,9.2,115,9.2z M115,27.5
|
||||
c-2.6,0-3.5-3.4-3.5-6.3s0.9-6.3,3.5-6.3c2.7,0,3.6,3.4,3.6,6.3S117.7,27.5,115,27.5z"/>
|
||||
|
||||
<linearGradient id="SVGID_3_" gradientUnits="userSpaceOnUse" x1="135.55" y1="10.6716" x2="135.55" y2="35.0304" gradientTransform="matrix(1 0 0 -1 0 44)">
|
||||
<stop offset="0" style="stop-color:#E93F3A"/>
|
||||
<stop offset="1" style="stop-color:#F48B40"/>
|
||||
</linearGradient>
|
||||
<path class="st2" d="M140,23.9c1.8-1,3.4-3.1,3.4-6.6c0-4.5-3.1-7.7-7.5-7.7H127v23.3h6.2v-7.8h0.7l3.3,7.8h6.9L140,23.9z
|
||||
M135.3,19.4h-2.1v-4.2h2.1c1.6,0,1.8,1.5,1.8,2.1C137.1,18.3,136.6,19.4,135.3,19.4z"/>
|
||||
|
||||
<linearGradient id="SVGID_4_" gradientUnits="userSpaceOnUse" x1="152.3" y1="10.6716" x2="152.3" y2="35.0304" gradientTransform="matrix(1 0 0 -1 0 44)">
|
||||
<stop offset="0" style="stop-color:#E93F3A"/>
|
||||
<stop offset="1" style="stop-color:#F48B40"/>
|
||||
</linearGradient>
|
||||
<polygon class="st3" points="151.8,27.1 151.8,9.5 145.6,9.5 145.6,32.8 159,32.8 159,27.1 "/>
|
||||
|
||||
<linearGradient id="SVGID_5_" gradientUnits="userSpaceOnUse" x1="169.8" y1="10.6716" x2="169.8" y2="35.0304" gradientTransform="matrix(1 0 0 -1 0 44)">
|
||||
<stop offset="0" style="stop-color:#E93F3A"/>
|
||||
<stop offset="1" style="stop-color:#F48B40"/>
|
||||
</linearGradient>
|
||||
<path class="st4" d="M168.6,9.5h-7.9v23.3h7.9c6.2,0,10.3-4.6,10.3-11.6C178.9,14.1,174.9,9.5,168.6,9.5z M167,15.2h1.6
|
||||
c3.4,0,4,3.7,4,6c0,1.4-0.3,5.9-4,5.9H167V15.2z"/>
|
||||
</g>
|
||||
<g>
|
||||
<path class="st5" d="M9.3,9.5H0.5v23.3h6.2V25h2.6c5.2,0,7.6-3.9,7.6-7.8C16.9,13.4,14.6,9.5,9.3,9.5z M6.7,15.2h2.1
|
||||
c1.1,0,1.8,0.8,1.8,2.1c0,1-0.5,2.1-1.8,2.1H6.7V15.2z"/>
|
||||
<path class="st5" d="M27.6,9.2c-5.9,0-9.9,4.8-9.9,12s4,12,9.9,12s9.9-4.8,9.9-12S33.5,9.2,27.6,9.2z M27.6,27.5
|
||||
c-2.6,0-3.5-3.4-3.5-6.3s0.9-6.3,3.5-6.3c2.7,0,3.6,3.4,3.6,6.3S30.2,27.5,27.6,27.5z"/>
|
||||
<path class="st5" d="M52.5,23.9c1.8-1,3.4-3.1,3.4-6.6c0-4.5-3.1-7.7-7.5-7.7h-8.9v23.3h6.2v-7.8h0.7l3.3,7.8h6.9L52.5,23.9z
|
||||
M47.8,19.4h-2.1v-4.2h2.1c1.6,0,1.8,1.5,1.8,2.1C49.6,18.3,49.1,19.4,47.8,19.4z"/>
|
||||
<polygon class="st5" points="70.4,9.5 70.4,19.8 64.9,9.5 59,9.5 59,32.8 65.2,32.8 65.2,22 71,32.8 76.7,32.8 76.7,9.5 "/>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 3.4 KiB |
After Width: | Height: | Size: 1.7 MiB |
After Width: | Height: | Size: 119 KiB |
After Width: | Height: | Size: 1.9 MiB |
After Width: | Height: | Size: 123 KiB |
After Width: | Height: | Size: 252 KiB |
After Width: | Height: | Size: 101 KiB |
|
@ -1,40 +1,40 @@
|
|||
const upsert = require('../src/utils/upsert');
|
||||
|
||||
const tagPosters = [
|
||||
['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'],
|
||||
['anal', 2, 'Sheena Shaw for Bang Bros'],
|
||||
['anal-creampie', 0, 'Gina Valentina and Jane Wilde in "A Very Special Anniversary" for Tushy'],
|
||||
['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'],
|
||||
['ass-to-mouth', 'poster', 'Alysa Gap and Logan in "Anal Buffet 4" for Evil Angel'],
|
||||
['bdsm', 0, 'Dani Daniels in "The Traning of Dani Daniels, Day 2" for The Training of O at Kink'],
|
||||
['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'],
|
||||
['blowbang', 'poster'],
|
||||
['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
|
||||
['bukkake', 'poster'],
|
||||
['caucasian', 1, 'Sheena Shaw for Brazzers'],
|
||||
['creampie', 'poster'],
|
||||
['da-tp', 0, 'Natasha Teen in LegalPorno SZ2164'],
|
||||
['deepthroat', 0, 'Chanel Grey in "Deepthroating Is Fun" for Throated'],
|
||||
['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'],
|
||||
['double-anal', 2, 'Lana Rhoades in "Lana Rhoades Unleashed" for HardX'],
|
||||
['double-penetration', 'poster', 'Mia Malkova in "DP!" for HardX'],
|
||||
['double-vaginal', 'poster', 'Riley Reid in "Pizza That Ass" for Reid My Lips'],
|
||||
['dv-tp', 'poster', 'Juelz Ventura in "Gangbanged 5" for Elegant Angel'],
|
||||
['oral-creampie', 1, 'Keisha Grey in Brazzers House'],
|
||||
['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'],
|
||||
['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'],
|
||||
['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
|
||||
['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'],
|
||||
['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
|
||||
['blowbang', 'poster'],
|
||||
['bukkake', 'poster'],
|
||||
['caucasian', 'poster'],
|
||||
['creampie', 'poster'],
|
||||
['ebony', 1, 'Sarah Banks for Brazzers'],
|
||||
['facial', 'poster'],
|
||||
['facefucking', '1', 'Carrie for Young Throats'],
|
||||
['facial', 'poster'],
|
||||
['gangbang', 'poster', 'Kristen Scott in "Interracial Gangbang!" for Jules Jordan'],
|
||||
['gaping', 1, 'Vina Sky in "Vina Sky Does Anal" for HardX'],
|
||||
['interracial', 'poster'],
|
||||
['latina', 'poster'],
|
||||
['mff', 0, 'Madison Ivy and Adriana Chechik in "Day With A Pornstar" for Brazzers'],
|
||||
['mfm', 'poster'],
|
||||
['oral-creampie', 1, 'Keisha Grey in Brazzers House'],
|
||||
['orgy', 'poster'],
|
||||
['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
|
||||
['swallowing', 'poster'],
|
||||
['tattoo', 'poster', 'Kali Roses in "Goes All In For Anal" for Hussie Pass'],
|
||||
['trainbang', 'poster', 'Kali Roses in "Passing Me Around" for Blacked'],
|
||||
['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'],
|
||||
]
|
||||
.map(([slug, filename, comment], index) => ({
|
||||
tagSlug: slug,
|
||||
|
@ -49,12 +49,15 @@ const tagPhotos = [
|
|||
['airtight', 2, 'Dakota Skye in "Dakota Goes Nuts" for ArchAngel'],
|
||||
['airtight', 3, 'Anita Bellini in "Triple Dick Gangbang" for Hands On Hardcore (DDF Network)'],
|
||||
['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'],
|
||||
['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
|
||||
['anal', 0],
|
||||
['caucasian', 'poster'],
|
||||
['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'],
|
||||
['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'],
|
||||
['da-tp', 3, 'Evelina Darling in GIO294'],
|
||||
['da-tp', 4, 'Ninel Mojado aka Mira Cuckold in GIO063 for LegalPorno'],
|
||||
['double-anal', 2, 'Lana Rhoades in "Gangbang Me 3" for HardX'],
|
||||
['double-anal', 6, 'Sheena Shaw in "Ass Worship 14" for Jules Jordan'],
|
||||
['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'],
|
||||
['double-anal', 'poster', 'Haley Reed in "Young Hot Ass" for Evil Angel'],
|
||||
['double-anal', 0, 'Nicole Black doing double anal during a gangbang in GIO971 for LegalPorno'],
|
||||
['double-anal', 1, 'Ria Sunn in SZ1801 for LegalPorno'],
|
||||
|
|
11
src/app.js
|
@ -24,18 +24,19 @@ async function init() {
|
|||
}
|
||||
|
||||
const updateBaseScenes = (argv.scrape || argv.sites || argv.networks) && await fetchUpdates();
|
||||
const deepScenes = await fetchScenes([...(argv.scenes || []), ...(updateBaseScenes || [])]);
|
||||
const deepScenes = argv.deep && await fetchScenes([...(argv.scenes || []), ...(updateBaseScenes || [])]);
|
||||
|
||||
console.log(deepScenes.map(scene => scene.movie));
|
||||
|
||||
const argvDeepMovies = argv.movies && await fetchMovies(argv.movies);
|
||||
const sceneMovies = deepScenes && argv.sceneMovies && deepScenes.map(scene => scene.movie).filter(Boolean);
|
||||
const deepMovies = await fetchMovies([...(argv.movies || []), ...(sceneMovies || [])]);
|
||||
|
||||
if (argv.save) {
|
||||
await storeReleases([
|
||||
...(deepScenes || []),
|
||||
...(argvDeepMovies || []),
|
||||
...(deepMovies || []),
|
||||
]);
|
||||
|
||||
// await storeReleaseActors(updateReleases);
|
||||
}
|
||||
|
||||
knex.destroy();
|
||||
}
|
||||
|
|
16
src/argv.js
|
@ -29,21 +29,27 @@ const { argv } = yargs
|
|||
type: 'array',
|
||||
alias: 'actor',
|
||||
})
|
||||
.option('with-scenes', {
|
||||
describe: 'Fetch all scenes for an actor or movie',
|
||||
.option('actor-scenes', {
|
||||
describe: 'Fetch all scenes for an actor',
|
||||
type: 'boolean',
|
||||
alias: 'with-releases',
|
||||
default: false,
|
||||
})
|
||||
.option('with-movies', {
|
||||
.option('movie-scenes', {
|
||||
describe: 'Fetch all scenes for a movie',
|
||||
type: 'boolean',
|
||||
alias: 'with-releases',
|
||||
default: false,
|
||||
})
|
||||
.option('scene-movies', {
|
||||
describe: 'Fetch movies for scenes',
|
||||
type: 'boolean',
|
||||
default: true,
|
||||
})
|
||||
.option('with-profiles', {
|
||||
.option('profiles', {
|
||||
describe: 'Scrape profiles for new actors after fetching scenes',
|
||||
type: 'boolean',
|
||||
alias: 'with-actors',
|
||||
alias: 'bios',
|
||||
default: false,
|
||||
})
|
||||
.option('scene', {
|
||||
|
|
|
@ -99,7 +99,7 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
|
|||
};
|
||||
}
|
||||
|
||||
const scraper = scrapers.releases[site.slug];
|
||||
const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
|
||||
|
||||
if (!scraper) {
|
||||
logger.warn(`Could not find scraper for ${baseRelease.url}`);
|
||||
|
@ -124,6 +124,7 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
|
|||
};
|
||||
|
||||
if (scrapedRelease && baseRelease?.tags) {
|
||||
// accumulate all available tags
|
||||
mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags);
|
||||
}
|
||||
|
||||
|
|
|
@ -141,7 +141,9 @@ async function fetchLatest(site, page = 1) {
|
|||
}
|
||||
|
||||
/**
 * Fetch a scene page and pass its HTML to the scene scraper.
 *
 * @param {string} url - URL of the scene page; requested exactly as given
 * @param {Object} site - site object forwarded to scrapeScene
 * @returns {Promise<*>} whatever scrapeScene returns for the fetched page
 */
async function fetchScene(url, site) {
  // DDF's main site moved to Porn World, so the URL is no longer rewritten
  // onto the ddfnetwork.com host before it is requested.
  const res = await bhttp.get(url);

  return scrapeScene(res.body.toString(), url, site);
}
|
||||
|
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
const config = require('config');
|
||||
|
||||
const argv = require('./argv');
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const slugify = require('./utils/slugify');
|
||||
const { associateTags } = require('./tags');
|
||||
|
||||
function curateReleaseEntry(release, batchId, existingRelease) {
|
||||
const slug = slugify(release.title, '-', {
|
||||
|
@ -34,7 +34,7 @@ function curateReleaseEntry(release, batchId, existingRelease) {
|
|||
updated_batch_id: batchId,
|
||||
};
|
||||
|
||||
if (!existingRelease) {
|
||||
if (!existingRelease && !release.id) {
|
||||
curatedRelease.created_batch_id = batchId;
|
||||
}
|
||||
|
||||
|
@ -60,7 +60,7 @@ async function attachChannelSites(releases) {
|
|||
};
|
||||
}
|
||||
|
||||
logger.error(`Unable to match channel '${release.channel.slug || release.channel}' from generic URL ${release.url}`);
|
||||
logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);
|
||||
|
||||
return null;
|
||||
})
|
||||
|
@ -93,15 +93,41 @@ async function attachStudios(releases) {
|
|||
return releasesWithStudio;
|
||||
}
|
||||
|
||||
/**
 * Attach the database ID of the matching stored release to each release.
 *
 * Stored rows are matched on site ID + entry ID. A release without a matching
 * stored row (including one whose site has no stored rows at all) receives
 * `id: undefined` instead of causing a TypeError.
 *
 * @param {Object[]} releases - releases exposing `site.id` and `entryId`
 * @param {Object[]} storedReleases - rows exposing `id`, `site_id` and `entry_id`
 * @returns {Object[]} shallow copies of the releases with an added `id` property
 */
function attachReleaseIds(releases, storedReleases) {
  // null-prototype lookups so that an entry ID such as 'constructor' cannot
  // resolve to an inherited Object.prototype member
  const storedReleaseIdsBySiteIdAndEntryId = storedReleases.reduce((acc, storedRelease) => {
    if (!acc[storedRelease.site_id]) {
      acc[storedRelease.site_id] = Object.create(null);
    }

    acc[storedRelease.site_id][storedRelease.entry_id] = storedRelease.id;

    return acc;
  }, Object.create(null));

  return releases.map(release => ({
    ...release,
    id: storedReleaseIdsBySiteIdAndEntryId[release.site?.id]?.[release.entryId],
  }));
}
|
||||
|
||||
/**
 * Split releases into those already stored and those that are new.
 *
 * A release counts as a duplicate when the `releases` table already holds a
 * row with the same site ID and entry ID.
 *
 * @param {Object[]} releases - releases exposing `site.id` and `entryId`
 * @returns {Promise<{uniqueReleases: Object[], duplicateReleases: Object[], duplicateReleaseEntries: Object[]}>}
 *   the releases not yet stored, the releases that are, and the stored rows
 *   that matched
 */
async function extractUniqueReleases(releases) {
  const duplicateReleaseEntries = await knex('releases')
    .whereIn(['entry_id', 'site_id'], releases.map(release => [release.entryId, release.site.id]));

  const duplicateReleasesBySiteIdAndEntryId = duplicateReleaseEntries.reduce((acc, releaseEntry) => {
    if (!acc[releaseEntry.site_id]) {
      acc[releaseEntry.site_id] = {};
    }

    acc[releaseEntry.site_id][releaseEntry.entry_id] = true;

    return acc;
  }, {});

  const uniqueReleases = [];
  const duplicateReleases = [];

  // single pass instead of two opposite filters over the same predicate
  releases.forEach((release) => {
    if (duplicateReleasesBySiteIdAndEntryId[release.site.id]?.[release.entryId]) {
      duplicateReleases.push(release);
      return;
    }

    uniqueReleases.push(release);
  });

  return {
    uniqueReleases,
    duplicateReleases,
    duplicateReleaseEntries,
  };
}
|
||||
|
||||
async function storeReleases(releases) {
|
||||
|
@ -111,19 +137,19 @@ async function storeReleases(releases) {
|
|||
const releasesWithStudios = await attachStudios(releasesWithSites);
|
||||
|
||||
// uniqueness is site ID + entry ID, filter uniques after adding sites
|
||||
const { uniqueReleases, duplicateReleases } = await extractUniqueReleases(releasesWithStudios);
|
||||
const { uniqueReleases, duplicateReleaseEntries } = await extractUniqueReleases(releasesWithStudios);
|
||||
|
||||
console.log(argv.redownload, duplicateReleases);
|
||||
const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
|
||||
|
||||
const curatedReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
|
||||
const storedReleases = await knex('releases').insert(curatedReleaseEntries).returning('*');
|
||||
const storedReleases = await knex('releases').insert(curatedNewReleaseEntries).returning('*');
|
||||
// TODO: update duplicate releases
|
||||
|
||||
if (Array.isArray(storedReleases)) {
|
||||
return storedReleases;
|
||||
}
|
||||
const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];
|
||||
const releasesWithId = attachReleaseIds(releases, [].concat(storedReleaseEntries, duplicateReleaseEntries));
|
||||
|
||||
// nothing inserted
|
||||
return [];
|
||||
await associateTags(releasesWithId);
|
||||
|
||||
return releasesWithId;
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
'use strict';
|
||||
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const whereOr = require('./utils/where-or');
|
||||
|
||||
/**
 * Shape a tag row for consumers, adding its alias names and media.
 *
 * Both lookups are started before either is awaited, so they run concurrently.
 *
 * @param {Object} tag - tag row; the group_* properties are read as-is
 * @returns {Promise<Object>} tag with poster, photos, group and alias names
 */
async function curateTag(tag) {
  const aliasLookup = knex('tags').where({ alias_for: tag.id });

  const mediaLookup = knex('media')
    .where('domain', 'tags')
    .andWhere('target_id', tag.id)
    .orderBy('index');

  const [aliasEntries, mediaEntries] = await Promise.all([aliasLookup, mediaLookup]);

  const poster = mediaEntries.find(item => item.role === 'poster');
  const photos = mediaEntries.filter(item => item.role === 'photo');

  const group = {
    id: tag.group_id,
    name: tag.group_name,
    description: tag.group_description,
    slug: tag.group_slug,
  };

  return {
    id: tag.id,
    name: tag.name,
    slug: tag.slug,
    description: tag.description,
    poster,
    photos,
    group,
    aliases: aliasEntries.map(alias => alias.name),
  };
}
|
||||
|
||||
/**
 * Curate a list of tag rows concurrently.
 *
 * @param {Object[]} tags - tag rows to pass to curateTag
 * @returns {Promise<Object[]>} curated tags in the same order as the input
 */
function curateTags(tags) {
  const pendingTags = tags.map(tag => curateTag(tag));

  return Promise.all(pendingTags);
}
|
||||
|
||||
/**
 * Look up tag IDs for raw tag names.
 *
 * Names are queried as given, lowercased and uppercased. Each `tags` row is
 * joined to the row it is an alias for, or to itself, and the joined row's ID
 * is what gets plucked.
 *
 * @param {Array<string|null|undefined>} rawTags - tag names; falsy values are dropped
 * @returns {Promise<Array>} the plucked `aliases.id` values
 */
async function matchTags(rawTags) {
  const presentTags = rawTags.filter(Boolean);
  const lowerCasedTags = presentTags.map(name => name.toLowerCase());
  const upperCasedTags = presentTags.map(name => name.toUpperCase());

  const tagNames = [...presentTags, ...lowerCasedTags, ...upperCasedTags];

  const matchedTagIds = await knex('tags')
    .pluck('aliases.id')
    .whereIn('tags.name', tagNames)
    .leftJoin('tags as aliases', function joinAliasTarget() {
      this
        .on('tags.alias_for', 'aliases.id')
        .orOn('tags.id', 'aliases.id');
    })
    .where(function whereNotAliased() {
      this
        .whereNull('tags.alias_for')
        .orWhereNull('aliases.alias_for');
    })
    .groupBy('aliases.id');

  return matchedTagIds;
}
|
||||
|
||||
/**
 * Associate a stored release with its own tags and the tags inherited from its site.
 *
 * Only associations that are not yet present in `releases_tags` are inserted.
 *
 * @param {Object} release - release with optional `tags` and `site.tags`
 * @param {number} releaseId - ID of the stored release
 * @returns {Promise<void>}
 */
async function associateTags(release, releaseId) {
  // only site tags flagged with inherit === true carry over to the release
  const inheritableSiteTags = release.site?.tags?.filter(tag => tag.inherit === true);
  const inheritedTagIds = inheritableSiteTags ? inheritableSiteTags.map(tag => tag.id) : [];

  const scrapedTags = release.tags?.filter(Boolean) || [];
  let releaseTagIds = scrapedTags; // tags already matched by (outdated) scraper

  if (scrapedTags.some(tag => typeof tag === 'string')) {
    // scraper returned raw tags
    releaseTagIds = await matchTags(release.tags);
  }

  const tagIds = [...new Set(releaseTagIds.concat(inheritedTagIds))];

  if (tagIds.length === 0) {
    logger.info(`No tags available for (${release.site.name}, ${releaseId}) "${release.title}"`);
    return;
  }

  const storedAssociations = await knex('releases_tags')
    .where('release_id', releaseId)
    .whereIn('tag_id', tagIds);

  const associatedTagIds = new Set(storedAssociations.map(entry => entry.tag_id));

  const newEntries = tagIds
    .filter(tagId => !associatedTagIds.has(tagId))
    .map(tagId => ({
      tag_id: tagId,
      release_id: releaseId,
    }));

  await knex('releases_tags').insert(newEntries);
}
|
||||
|
||||
/**
 * Fetch tags matching the tag query or the tag group query, joined with their
 * group, and curate them.
 *
 * @param {Object} queryObject - conditions handed to whereOr for the `tags` table
 * @param {Object} groupsQueryObject - conditions handed to whereOr for the `tags_groups` table
 * @param {number} [limit=100] - maximum number of rows to fetch
 * @returns {Promise<Object[]>} the result of curateTags for the fetched rows
 */
async function fetchTags(queryObject, groupsQueryObject, limit = 100) {
  // NOTE(review): where / orWhere / andWhere are chained without grouping, so
  // the alias_for condition may only bind to the group query (A OR (B AND C)).
  // Confirm whether aliases should be excluded for both queries.
  const tags = await knex('tags')
    .where(builder => whereOr(queryObject, 'tags', builder))
    .orWhere(builder => whereOr(groupsQueryObject, 'tags_groups', builder))
    .andWhere({ 'tags.alias_for': null })
    .select(
      'tags.*',
      'tags_groups.id as group_id',
      'tags_groups.name as group_name',
      'tags_groups.slug as group_slug',
      // curateTag reads tag.group_description; the previous alias
      // 'groups_description' left the curated group description undefined
      'tags_groups.description as group_description',
    )
    .leftJoin('tags_groups', 'tags.group_id', 'tags_groups.id')
    .orderBy('name')
    .limit(limit);

  return curateTags(tags);
}
|
||||
|
||||
module.exports = {
|
||||
associateTags,
|
||||
fetchTags,
|
||||
matchTags,
|
||||
};
|
165
src/tags.js
|
@ -1,110 +1,103 @@
|
|||
'use strict';
|
||||
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const whereOr = require('./utils/where-or');
|
||||
const slugify = require('./utils/slugify');
|
||||
|
||||
async function curateTag(tag) {
|
||||
const [aliases, media] = await Promise.all([
|
||||
knex('tags').where({ alias_for: tag.id }),
|
||||
knex('media')
|
||||
.where('domain', 'tags')
|
||||
.andWhere('target_id', tag.id)
|
||||
.orderBy('index'),
|
||||
]);
|
||||
async function matchReleaseTags(releases) {
|
||||
const rawTags = releases
|
||||
.map(release => release.tags).flat()
|
||||
.filter(Boolean);
|
||||
|
||||
return {
|
||||
id: tag.id,
|
||||
name: tag.name,
|
||||
slug: tag.slug,
|
||||
description: tag.description,
|
||||
poster: media.find(photo => photo.role === 'poster'),
|
||||
photos: media.filter(photo => photo.role === 'photo'),
|
||||
group: {
|
||||
id: tag.group_id,
|
||||
name: tag.group_name,
|
||||
description: tag.group_description,
|
||||
slug: tag.group_slug,
|
||||
},
|
||||
aliases: aliases.map(({ name }) => name),
|
||||
};
|
||||
}
|
||||
|
||||
function curateTags(tags) {
|
||||
return Promise.all(tags.map(async tag => curateTag(tag)));
|
||||
}
|
||||
|
||||
async function matchTags(rawTags) {
|
||||
const filteredTags = rawTags.filter(Boolean);
|
||||
|
||||
const tags = filteredTags
|
||||
.concat(filteredTags.map(tag => tag.toLowerCase()))
|
||||
.concat(filteredTags.map(tag => tag.toUpperCase()));
|
||||
const casedTags = Array.from(new Set(
|
||||
rawTags
|
||||
.concat(rawTags.map(tag => tag.toLowerCase()))
|
||||
.concat(rawTags.map(tag => tag.toUpperCase())),
|
||||
));
|
||||
|
||||
const tagEntries = await knex('tags')
|
||||
.pluck('aliases.id')
|
||||
.whereIn('tags.name', tags)
|
||||
.leftJoin('tags as aliases', function join() {
|
||||
this
|
||||
.on('tags.alias_for', 'aliases.id')
|
||||
.orOn('tags.id', 'aliases.id');
|
||||
})
|
||||
.where(function where() {
|
||||
this
|
||||
.whereNull('tags.alias_for')
|
||||
.orWhereNull('aliases.alias_for');
|
||||
})
|
||||
.groupBy('aliases.id');
|
||||
.select('tags.id', 'tags.name', 'tags.alias_for')
|
||||
.whereIn('tags.name', casedTags);
|
||||
|
||||
return tagEntries;
|
||||
const tagIdsBySlug = tagEntries
|
||||
.reduce((acc, tag) => ({
|
||||
...acc,
|
||||
[slugify(tag.name)]: tag.alias_for || tag.id,
|
||||
}), {});
|
||||
|
||||
return tagIdsBySlug;
|
||||
}
|
||||
|
||||
async function associateTags(release, releaseId) {
|
||||
const siteTags = release.site?.tags?.filter(tag => tag.inherit === true).map(tag => tag.id) || [];
|
||||
async function getSiteTags(releases) {
|
||||
const siteIds = releases.map(release => release.site.id);
|
||||
const siteTags = await knex('sites_tags').whereIn('site_id', siteIds);
|
||||
|
||||
const rawReleaseTags = release.tags?.filter(Boolean) || [];
|
||||
const releaseTags = rawReleaseTags.some(tag => typeof tag === 'string')
|
||||
? await matchTags(release.tags) // scraper returned raw tags
|
||||
: rawReleaseTags; // tags already matched by (outdated) scraper
|
||||
|
||||
const tags = Array.from(new Set(releaseTags.concat(siteTags)));
|
||||
|
||||
if (tags.length === 0) {
|
||||
logger.info(`No tags available for (${release.site.name}, ${releaseId}) "${release.title}"`);
|
||||
return;
|
||||
const siteTagIdsBySiteId = siteTags.reduce((acc, siteTag) => {
|
||||
if (!acc[siteTag.site_id]) {
|
||||
acc[siteTag.site_id] = [];
|
||||
}
|
||||
|
||||
const associationEntries = await knex('releases_tags')
|
||||
.where('release_id', releaseId)
|
||||
.whereIn('tag_id', tags);
|
||||
acc[siteTag.site_id].push(siteTag.tag_id);
|
||||
|
||||
const existingAssociations = new Set(associationEntries.map(association => association.tag_id));
|
||||
const newAssociations = tags.filter(tagId => !existingAssociations.has(tagId));
|
||||
return acc;
|
||||
}, {});
|
||||
|
||||
await knex('releases_tags').insert(newAssociations.map(tagId => ({
|
||||
return siteTagIdsBySiteId;
|
||||
}
|
||||
|
||||
/**
 * Build `releases_tags` rows for a batch of stored releases.
 *
 * Each release is associated with its own tags and with the tags of its site;
 * duplicate and unmatched (falsy) tag IDs are dropped per release.
 *
 * @param {Object[]} releases - releases exposing `id`, `site.id` and optionally `tags`
 * @param {Object} tagIdsBySlug - tag ID per slugified tag name
 * @param {Object} siteTagIdsBySiteId - array of tag IDs per site ID
 * @returns {Array<{release_id: *, tag_id: *}>} flat list of association rows
 */
function buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId) {
  return releases.flatMap((release) => {
    const siteTagIds = siteTagIdsBySiteId[release.site.id] ?? [];

    // resolved per tag: numbers are pre-matched IDs from obsolete scrapers,
    // anything else is a raw name looked up by slug; a release without tags
    // contributes only its site tags
    const releaseTagIds = (release.tags ?? [])
      .map(tag => (typeof tag === 'number' ? tag : tagIdsBySlug[slugify(tag)]));

    // filter duplicates and empties
    const uniqueTagIds = new Set(releaseTagIds.concat(siteTagIds).filter(Boolean));

    return Array.from(uniqueTagIds, tagId => ({
      release_id: release.id,
      tag_id: tagId,
    }));
  });
}
|
||||
|
||||
async function fetchTags(queryObject, groupsQueryObject, limit = 100) {
|
||||
const tags = await knex('tags')
|
||||
.where(builder => whereOr(queryObject, 'tags', builder))
|
||||
.orWhere(builder => whereOr(groupsQueryObject, 'tags_groups', builder))
|
||||
.andWhere({ 'tags.alias_for': null })
|
||||
.select(
|
||||
'tags.*',
|
||||
'tags_groups.id as group_id', 'tags_groups.name as group_name', 'tags_groups.slug as group_slug', 'tags_groups.description as groups_description',
|
||||
)
|
||||
.leftJoin('tags_groups', 'tags.group_id', 'tags_groups.id')
|
||||
.orderBy('name')
|
||||
.limit(limit);
|
||||
/**
 * Drop tag associations that already exist in `releases_tags`.
 *
 * @param {Array<{release_id: *, tag_id: *}>} tagAssociations - candidate rows
 * @returns {Promise<Array<{release_id: *, tag_id: *}>>} the candidates without a stored counterpart
 */
async function extractUniqueAssociations(tagAssociations) {
  if (tagAssociations.length === 0) {
    // nothing to compare, skip the query
    return [];
  }

  const duplicateAssociations = await knex('releases_tags')
    .whereIn(['release_id', 'tag_id'], tagAssociations.map(association => [association.release_id, association.tag_id]));

  const duplicateAssociationsByReleaseIdAndTagId = duplicateAssociations.reduce((acc, association) => {
    if (!acc[association.release_id]) {
      acc[association.release_id] = {};
    }

    acc[association.release_id][association.tag_id] = true;

    return acc;
  }, {});

  return tagAssociations
    .filter(association => !duplicateAssociationsByReleaseIdAndTagId[association.release_id]?.[association.tag_id]);
}
|
||||
|
||||
/**
 * Associate a batch of stored releases with their own tags and their sites' tags.
 *
 * @param {Object[]} releases - stored releases, passed on to the tag and site tag lookups
 * @returns {Promise<void>}
 */
async function associateTags(releases) {
  // the two lookups do not depend on each other
  const [tagIdsBySlug, siteTagIdsBySiteId] = await Promise.all([
    matchReleaseTags(releases),
    getSiteTags(releases),
  ]);

  const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, siteTagIdsBySiteId);

  // extractUniqueAssociations is async; without await a pending Promise
  // instead of the rows would be handed to insert
  const uniqueAssociations = await extractUniqueAssociations(tagAssociations);

  if (uniqueAssociations.length === 0) {
    return;
  }

  await knex('releases_tags').insert(uniqueAssociations);
}
|
||||
|
||||
module.exports = {
|
||||
associateTags,
|
||||
fetchTags,
|
||||
matchTags,
|
||||
};
|
||||
|
|
|
@ -34,7 +34,7 @@ async function extractUniqueReleases(latestReleases, accReleases) {
|
|||
|
||||
// add entry IDs of accumulated releases to prevent an infinite scrape loop
|
||||
// when one page contains the same release as the previous
|
||||
const duplicateReleaseIdentifiers = duplicateReleases
|
||||
const duplicateReleasesSiteIdAndEntryIds = duplicateReleases
|
||||
.concat(accReleases)
|
||||
.reduce((acc, release) => {
|
||||
const siteId = release.site_id || release.site.id;
|
||||
|
@ -47,7 +47,7 @@ async function extractUniqueReleases(latestReleases, accReleases) {
|
|||
}, {});
|
||||
|
||||
const uniqueReleases = latestReleases
|
||||
.filter(release => !duplicateReleaseIdentifiers[release.site.id]?.[release.entryId]);
|
||||
.filter(release => !duplicateReleasesSiteIdAndEntryIds[release.site.id]?.[release.entryId]);
|
||||
|
||||
return uniqueReleases;
|
||||
}
|
||||
|
|
|
@ -52,11 +52,11 @@ queue.define('http', async ({
|
|||
|
||||
const reqOptions = {
|
||||
headers: {
|
||||
...(options.defaultHeaders !== false && defaultHeaders),
|
||||
...headers,
|
||||
...defaultHeaders,
|
||||
},
|
||||
...options,
|
||||
...defaultOptions,
|
||||
...options,
|
||||
...(options.timeout && { responseTimeout: options.timeout }),
|
||||
};
|
||||
|
||||
|
|
|
@ -288,9 +288,7 @@ function extractAll(htmlValue, selector) {
|
|||
}
|
||||
|
||||
async function get(urlValue, selector, headers, options, queryAll = false) {
|
||||
const res = await http.get(urlValue, {
|
||||
headers,
|
||||
});
|
||||
const res = await http.get(urlValue, headers);
|
||||
|
||||
if (res.statusCode === 200) {
|
||||
const item = queryAll
|
||||
|
|
|
@ -4,6 +4,10 @@ function slugify(string, delimiter = '-', {
|
|||
encode = false,
|
||||
limit = 1000,
|
||||
} = {}) {
|
||||
if (!string) {
|
||||
return '';
|
||||
}
|
||||
|
||||
const slugComponents = string.trim().toLowerCase().match(/\w+/g);
|
||||
|
||||
if (!slugComponents) {
|
||||
|
|