Querying infinite parent depth for deep release entities.
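Context for the change below: the old findEntities() in src/deep.js joined only one level of parents (row_to_json(parents) as parent), and the scraper fallback stopped at entity.parent?.parent, so entities nested more than two levels deep could not resolve their scraper. The new recursive CTE attaches the full ancestor chain under nested parent keys, and the scraper lookup now recurses over it. A minimal sketch of the resulting shape (the slugs are placeholders, not real data):

// Illustrative only: one value from the map returned by fetchReleaseEntities().
// Every ancestor is nested under a 'parent' key, to any depth; to_jsonb() keeps
// the raw columns such as parent_id on each level.
const entity = {
  slug: 'examplechannel',
  type: 'channel',
  parent_id: 2,
  parent: {
    slug: 'examplenetwork',
    type: 'network',
    parent_id: null,
  },
};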
[8 binary image assets (410 KiB, 8.0 KiB, 261 KiB, 34 KiB, 541 KiB, 5.3 KiB, 353 KiB, 24 KiB)]
@@ -685,12 +685,13 @@ const tagPhotos = [
   ['69', 2, 'Abigail Mac and Kissa Sins in "Lesbian Anal Workout" for HardX'],
   ['airtight', 7, 'Lana Rhoades in "Gangbang Me 3" for HardX'],
   ['airtight', 6, 'Remy Lacroix in "Ass Worship 14" for Jules Jordan'],
+  ['airtight', 11, 'Malena Nazionale in "Rocco\'s Perverted Secretaries 2: Italian Edition" for Rocco Siffredi'],
+  ['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'],
   ['airtight', 10, 'Asa Akira in "Asa Akira To The Limit" for Jules Jordan'],
   ['airtight', 8, 'Veronica Leal in LegalPorno SZ2520'],
-  ['airtight', 5, 'Chloe Amour in "DP Masters 4" for Jules Jordan'],
   ['airtight', 3, 'Anita Bellini in "Triple Dick Gangbang" for Hands On Hardcore (DDF Network)'],
+  ['airtight', 5, 'Chloe Amour in "DP Masters 4" for Jules Jordan'],
   ['airtight', 9, 'Cindy Shine in LegalPorno GP1658'],
-  ['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'],
   ['atm', 3, 'Natasha Teen in "Work That Ass!" for Her Limit'],
   ['atm', 0, 'Roxy Lips in "Under Her Coat" for 21 Naturals'],
   ['atm', 6, 'Jane Wilde in "Teen Anal" for Evil Angel'],

@@ -873,10 +874,11 @@ const tagPhotos = [
   ['orgy', 'poster', 'Zoey Mornoe (DP), Jillian Janson (sex), Frida Sante, Katerina Kay and Natasha Starr in "Orgy Masters 6" for Jules Jordan'],
   ['pussy-eating', 4, 'Anastasia Knight and Jillian Janson in "Teach Me" for Screwbox'],
   ['pussy-eating', 7, 'Jewelz Blu and Katie Kush in "Pick Your Pleasure" for Reality Kings'],
-  ['pussy-eating', 6, 'Abella Danger and Karma Rx in "Neon Dreaming" for Brazzers'],
+  ['pussy-eating', 8, 'Sia Lust and Lacey London in "Naughty Gamer Girls" for Girls Gone Pink'],
   ['pussy-eating', 0, 'Kali Roses and Emily Willis\' pussy in "Peeping On My Neighbor" for Girl Girl'],
   ['pussy-eating', 2, 'Anikka Albrite and Mia Malkova in "Big Anal Bombshells" for LesbianX'],
   ['pussy-eating', 3, 'Kylie Page and Kalina Ryu in "Training My Masseuse" for All Girl Massage'],
+  ['pussy-eating', 6, 'Abella Danger and Karma Rx in "Neon Dreaming" for Brazzers'],
   ['pussy-eating', 1, 'Anikka Albrite and Riley Reid for In The Crack'],
   ['redhead', 0, 'Penny Pax in "The Submission of Emma Marx: Boundaries" for New Sensations'],
   ['schoolgirl', 1, 'Eliza Ibarra for Brazzers'],
src/deep.js (80 changed lines)
@@ -5,49 +5,11 @@ const merge = require('object-merge-advanced');
 const argv = require('./argv');
 const include = require('./utils/argv-include')(argv);
+const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
 const logger = require('./logger')(__filename);
-const knex = require('./knex');
 const qu = require('./utils/qu');
 const scrapers = require('./scrapers/scrapers');
 
-function urlToSiteSlug(url) {
-  try {
-    const slug = new URL(url)
-      .hostname
-      .match(/([\w-]+)\.\w+$/)?.[1]
-      .replace(/[-_]+/g, '');
-
-    return slug;
-  } catch (error) {
-    logger.warn(`Failed to derive entity slug from '${url}': ${error.message}`);
-
-    return null;
-  }
-}
-
-async function findEntities(baseReleases) {
-  const baseReleasesWithoutEntity = baseReleases.filter(release => release.url && !release.site && !release.entity);
-
-  const entitySlugs = Array.from(new Set(
-    baseReleasesWithoutEntity
-      .map(baseRelease => urlToSiteSlug(baseRelease.url))
-      .filter(Boolean),
-  ));
-
-  const entities = await knex('entities')
-    .select(knex.raw('entities.*, row_to_json(parents) as parent, json_agg(children) as children'))
-    .leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
-    .leftJoin('entities as children', 'children.parent_id', 'entities.id')
-    .whereIn('entities.slug', entitySlugs)
-    .groupBy('entities.id', 'parents.id')
-    .orderBy('entities.type', 'asc');
-
-  // channel entity will overwrite network entity
-  const entitiesBySlug = entities.reduce((accEntities, entity) => ({ ...accEntities, [entity.slug]: accEntities[entity.slug] || entity }), {});
-
-  return entitiesBySlug;
-}
-
 function toBaseReleases(baseReleasesOrUrls, entity = null) {
   if (!baseReleasesOrUrls) {
     return [];

@@ -106,8 +68,32 @@ async function fetchScene(scraper, url, entity, baseRelease, options) {
   return null;
 }
 
-async function scrapeRelease(baseRelease, entities, type = 'scene') {
-  const entity = baseRelease.entity || entities[urlToSiteSlug(baseRelease.url)];
+function findScraper(entity) {
+  if (scrapers.releases[entity.slug]) {
+    return scrapers.releases[entity.slug];
+  }
+
+  if (entity.parent) {
+    return findScraper(entity.parent);
+  }
+
+  return null;
+}
+
+function findLayoutScraper(entity, scraper) {
+  if (scraper?.[entity.parameters?.layout]) {
+    return scraper[entity.parameters.layout];
+  }
+
+  if (entity.parent) {
+    return findLayoutScraper(entity.parent, scraper);
+  }
+
+  return scraper;
+}
+
+async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
+  const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];
 
   if (!entity) {
     logger.warn(`No entity available for ${baseRelease.url}`);

@@ -121,8 +107,8 @@ async function scrapeRelease(baseRelease, entities, type = 'scene') {
     };
   }
 
-  const scraper = scrapers.releases[entity.slug] || scrapers.releases[entity.parent?.slug] || scrapers.releases[entity.parent?.parent?.slug];
-  const layoutScraper = scraper?.[entity.parameters?.layout] || scraper?.[entity.parent?.parameters?.layout] || scraper?.[entity.parent?.parent?.parameters?.layout] || scraper;
+  const scraper = findScraper(entity);
+  const layoutScraper = findLayoutScraper(entity, scraper);
 
   if (!layoutScraper) {
     logger.warn(`Could not find scraper for ${baseRelease.url}`);

@@ -184,19 +170,19 @@ async function scrapeRelease(baseRelease, entities, type = 'scene') {
   }
 }
 
-async function scrapeReleases(baseReleases, entities, type) {
+async function scrapeReleases(baseReleases, entitiesBySlug, type) {
   return Promise.map(
     baseReleases,
-    async baseRelease => scrapeRelease(baseRelease, entities, type),
+    async baseRelease => scrapeRelease(baseRelease, entitiesBySlug, type),
     { concurrency: 10 },
   );
 }
 
 async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
   const baseReleases = toBaseReleases(baseReleasesOrUrls);
-  const entities = await findEntities(baseReleases);
+  const entitiesBySlug = await fetchReleaseEntities(baseReleases);
 
-  const deepReleases = await scrapeReleases(baseReleases, entities, type);
+  const deepReleases = await scrapeReleases(baseReleases, entitiesBySlug, type);
 
   return deepReleases.filter(Boolean);
 }
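findScraper() and findLayoutScraper() replace the fixed entity.parent?.parent fallbacks with recursion, so a release scraper registered for any ancestor is found regardless of nesting depth. A standalone sketch of the same lookup pattern (the slugs and the stub scraper map are made up; the real functions use scrapers.releases from ./scrapers/scrapers):

// Standalone illustration of the recursive slug lookup used by findScraper().
const releaseScrapers = {
  examplenetwork: { fetchScene: async () => null }, // stub scraper, assumption
};

function findScraperSketch(entity) {
  if (releaseScrapers[entity.slug]) {
    return releaseScrapers[entity.slug];
  }

  return entity.parent ? findScraperSketch(entity.parent) : null;
}

const channel = {
  slug: 'examplechannel',
  parent: { slug: 'examplesubnetwork', parent: { slug: 'examplenetwork', parent: null } },
};

// Resolves the scraper registered for 'examplenetwork', two levels up.
console.log(findScraperSketch(channel) !== null); // true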
src/entities.js

@@ -66,6 +66,21 @@ async function curateEntities(entities, includeParameters) {
   return Promise.all(entities.map(async entity => curateEntity(entity, includeParameters)));
 }
 
+function urlToSiteSlug(url) {
+  try {
+    const slug = new URL(url)
+      .hostname
+      .match(/([\w-]+)\.\w+$/)?.[1]
+      .replace(/[-_]+/g, '');
+
+    return slug;
+  } catch (error) {
+    logger.warn(`Failed to derive entity slug from '${url}': ${error.message}`);
+
+    return null;
+  }
+}
+
 async function fetchIncludedEntities() {
   const include = {
     includeAll: !argv.networks && !argv.channels && !config.include?.networks && !config.include?.channels,

@@ -139,6 +154,46 @@ async function fetchIncludedEntities() {
   return curatedNetworks;
 }
 
+async function fetchReleaseEntities(baseReleases) {
+  const baseReleasesWithoutEntity = baseReleases.filter(release => release.url && !release.site && !release.entity);
+
+  const entitySlugs = Array.from(new Set(
+    baseReleasesWithoutEntity
+      .map(baseRelease => urlToSiteSlug(baseRelease.url))
+      .filter(Boolean),
+  ));
+
+  const entities = await knex.raw(`
+    WITH RECURSIVE tree as (
+      SELECT to_jsonb(entities) as entity,
+        parent_id,
+        array['parent'] as parent_path,
+        0 as depth
+      FROM entities
+      WHERE slug = ANY(:entitySlugs)
+
+      UNION ALL
+
+      SELECT jsonb_set(tree.entity, tree.parent_path, to_jsonb(entities)),
+        entities.parent_id,
+        tree.parent_path || array['parent'],
+        depth + 1
+      FROM tree
+      JOIN entities ON tree.parent_id = entities.id
+    )
+    SELECT entity FROM tree WHERE parent_id is null
+    ORDER BY entity->'type' ASC;
+  `, { entitySlugs });
+
+  // channel entity will overwrite network entity
+  const entitiesBySlug = entities.rows.reduce((accEntities, { entity }) => ({
+    ...accEntities,
+    [entity.slug]: accEntities[entity.slug] || curateEntity(entity, true),
+  }), {});
+
+  return entitiesBySlug;
+}
+
 async function fetchEntity(entityId, type) {
   const entity = await knex('entities')
     .select(knex.raw(`

@@ -290,8 +345,10 @@ module.exports = {
   curateEntity,
   curateEntities,
   fetchIncludedEntities,
+  fetchReleaseEntities,
   fetchEntity,
   fetchEntities,
   searchEntities,
   flushEntities,
+  urlToSiteSlug,
 };
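fetchReleaseEntities() binds the slug list as a named knex.raw parameter; slug = ANY(:entitySlugs) expects that binding to arrive as a Postgres array. The recursive CTE starts from the matched rows, keeps joining each row's parent, and writes every ancestor it finds into the accumulated JSON at parent, parent.parent, and so on, returning only the rows whose chain reached a root (parent_id is null). A hypothetical call (the URL and slug are invented for illustration):

// Hypothetical usage from src/; the URL is made up.
const { fetchReleaseEntities } = require('./entities');

(async () => {
  const entitiesBySlug = await fetchReleaseEntities([
    { url: 'https://www.examplechannel.com/scene/12345/example-scene' },
  ]);

  // For the made-up slug above, the channel's network (if any) would sit at
  // entitiesBySlug.examplechannel?.parent, with deeper ancestors under parent.parent.
  console.log(Object.keys(entitiesBySlug));
})();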
@@ -164,7 +164,6 @@ function attachReleaseIds(releases, storedReleases) {
 
 function filterInternalDuplicateReleases(releases) {
   const releasesByEntityIdAndEntryId = releases.reduce((acc, release) => {
-    console.log(release);
     if (!release.entity) {
       return acc;
     }
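The last hunk only removes a stray console.log from filterInternalDuplicateReleases(). For orientation, the surrounding reducer groups releases by entity id and entry id so duplicates within one batch collapse to a single entry; the accumulator body sits outside this hunk, so the sketch below is an assumption about its shape, not the actual implementation:

// Assumed shape of the grouping in filterInternalDuplicateReleases();
// the real accumulator body is not shown in the diff.
function groupByEntityIdAndEntryId(releases) {
  return releases.reduce((acc, release) => {
    if (!release.entity) {
      return acc;
    }

    return {
      ...acc,
      [release.entity.id]: {
        ...acc[release.entity.id],
        [release.entryId]: release, // later duplicates replace earlier ones
      },
    };
  }, {});
}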