Matching URLs to entity using hostname rather than slug to minimize collisions. Fixed missing Cum Louder POV logo.
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
Before Width: | Height: | Size: 3.9 KiB After Width: | Height: | Size: 3.9 KiB |
Before Width: | Height: | Size: 2.7 KiB After Width: | Height: | Size: 2.8 KiB |
Before Width: | Height: | Size: 2.4 KiB After Width: | Height: | Size: 2.5 KiB |
Before Width: | Height: | Size: 7.2 KiB After Width: | Height: | Size: 7.2 KiB |
Before Width: | Height: | Size: 6.8 KiB After Width: | Height: | Size: 6.8 KiB |
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.3 KiB |
After Width: | Height: | Size: 3.2 KiB |
Before Width: | Height: | Size: 3.0 KiB After Width: | Height: | Size: 3.1 KiB |
Before Width: | Height: | Size: 2.9 KiB After Width: | Height: | Size: 2.9 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 7.2 KiB After Width: | Height: | Size: 7.2 KiB |
Before Width: | Height: | Size: 3.9 KiB After Width: | Height: | Size: 3.9 KiB |
Before Width: | Height: | Size: 4.8 KiB After Width: | Height: | Size: 4.8 KiB |
Before Width: | Height: | Size: 4.0 KiB After Width: | Height: | Size: 4.1 KiB |
Before Width: | Height: | Size: 5.6 KiB After Width: | Height: | Size: 5.7 KiB |
Before Width: | Height: | Size: 6.9 KiB After Width: | Height: | Size: 6.9 KiB |
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.3 KiB |
Before Width: | Height: | Size: 5.5 KiB After Width: | Height: | Size: 5.5 KiB |
Before Width: | Height: | Size: 4.4 KiB After Width: | Height: | Size: 4.5 KiB |
Before Width: | Height: | Size: 5.3 KiB After Width: | Height: | Size: 5.4 KiB |
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.2 KiB |
Before Width: | Height: | Size: 3.4 KiB After Width: | Height: | Size: 3.4 KiB |
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 19 KiB |
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 16 KiB |
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
Before Width: | Height: | Size: 27 KiB After Width: | Height: | Size: 27 KiB |
Before Width: | Height: | Size: 8.9 KiB After Width: | Height: | Size: 9.0 KiB |
After Width: | Height: | Size: 12 KiB |
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 19 KiB |
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 9.0 KiB After Width: | Height: | Size: 9.0 KiB |
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 19 KiB |
Before Width: | Height: | Size: 13 KiB After Width: | Height: | Size: 13 KiB |
Before Width: | Height: | Size: 17 KiB After Width: | Height: | Size: 18 KiB |
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
Before Width: | Height: | Size: 8.9 KiB After Width: | Height: | Size: 9.0 KiB |
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 18 KiB |
Before Width: | Height: | Size: 9.9 KiB After Width: | Height: | Size: 10 KiB |
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 16 KiB |
Before Width: | Height: | Size: 10 KiB After Width: | Height: | Size: 10 KiB |
Before Width: | Height: | Size: 17 KiB After Width: | Height: | Size: 17 KiB |
19
src/deep.js
|
@ -7,7 +7,7 @@ const { mergeAdvanced: merge } = require('object-merge-advanced');
|
||||||
|
|
||||||
const argv = require('./argv');
|
const argv = require('./argv');
|
||||||
const include = require('./utils/argv-include')(argv);
|
const include = require('./utils/argv-include')(argv);
|
||||||
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
|
const { fetchReleaseEntities, urlToHostname } = require('./entities');
|
||||||
const logger = require('./logger')(__filename);
|
const logger = require('./logger')(__filename);
|
||||||
const qu = require('./utils/qu');
|
const qu = require('./utils/qu');
|
||||||
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
||||||
|
@ -110,8 +110,11 @@ function fetchMovie(scraper, url, entity, baseRelease, options) {
|
||||||
return fetchScene(scraper, url, entity, baseRelease, options, 'movie');
|
return fetchScene(scraper, url, entity, baseRelease, options, 'movie');
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
async function scrapeRelease(baseRelease, entitiesByHostname, type = 'scene') {
|
||||||
const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];
|
const entity = baseRelease.entity || entitiesByHostname[urlToHostname(baseRelease.url)];
|
||||||
|
|
||||||
|
console.log(entitiesByHostname);
|
||||||
|
console.log(entity);
|
||||||
|
|
||||||
if (!entity) {
|
if (!entity) {
|
||||||
logger.warn(`No entity available for ${baseRelease.url}`);
|
logger.warn(`No entity available for ${baseRelease.url}`);
|
||||||
|
@ -222,10 +225,8 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeReleases(baseReleases, entitiesBySlug, type) {
|
async function scrapeReleases(baseReleases, entitiesByHostname, type) {
|
||||||
const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesBySlug).map(async ([slug, entity]) => {
|
const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesByHostname).map(async ([slug, entity]) => {
|
||||||
console.log('scraper', entity.scraper?.beforeFetchScenes);
|
|
||||||
|
|
||||||
if (entity.scraper?.beforeFetchScenes) {
|
if (entity.scraper?.beforeFetchScenes) {
|
||||||
const parameters = getRecursiveParameters(entity);
|
const parameters = getRecursiveParameters(entity);
|
||||||
const preData = await entity.scraper.beforeFetchScenes(entity, parameters);
|
const preData = await entity.scraper.beforeFetchScenes(entity, parameters);
|
||||||
|
@ -249,9 +250,9 @@ async function scrapeReleases(baseReleases, entitiesBySlug, type) {
|
||||||
|
|
||||||
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
|
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
|
||||||
const baseReleases = toBaseReleases(baseReleasesOrUrls);
|
const baseReleases = toBaseReleases(baseReleasesOrUrls);
|
||||||
const entitiesBySlug = await fetchReleaseEntities(baseReleases);
|
const entitiesByHostname = await fetchReleaseEntities(baseReleases);
|
||||||
|
|
||||||
const deepReleases = await scrapeReleases(baseReleases, entitiesBySlug, type);
|
const deepReleases = await scrapeReleases(baseReleases, entitiesByHostname, type);
|
||||||
|
|
||||||
return deepReleases.filter(Boolean);
|
return deepReleases.filter(Boolean);
|
||||||
}
|
}
|
||||||
|
|
|
@ -82,6 +82,7 @@ async function curateEntities(entities, includeParameters) {
|
||||||
return Promise.all(entities.map(async (entity) => curateEntity(entity, includeParameters)));
|
return Promise.all(entities.map(async (entity) => curateEntity(entity, includeParameters)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* obsolete in favor of urlToHostname
|
||||||
function urlToSiteSlug(url) {
|
function urlToSiteSlug(url) {
|
||||||
try {
|
try {
|
||||||
const slug = new URL(url)
|
const slug = new URL(url)
|
||||||
|
@ -96,6 +97,21 @@ function urlToSiteSlug(url) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Derive the hostname of a URL, without a leading `www.` label, for use as an
 * entity lookup key.
 *
 * @param {string} url - Absolute URL to extract the hostname from.
 * @returns {string|null} The hostname with any leading `www.` removed, or
 *   `null` when the URL cannot be parsed or has no hostname.
 */
function urlToHostname(url) {
  try {
    const { hostname } = new URL(url);

    // The `www.` prefix is optional and only meaningful at the very start of
    // the host. Requiring it would drop every bare-domain URL, and matching it
    // unanchored would truncate hosts that merely contain `www.`.
    const bareHostname = hostname.replace(/^www\./, '');

    // Callers use the result as an object key and filter on truthiness, so an
    // empty hostname (e.g. file: URLs) is reported as "no hostname".
    return bareHostname || null;
  } catch (error) {
    logger.warn(`Failed to derive entity hostname from '${url}': ${error.message}`);

    return null;
  }
}
|
||||||
|
|
||||||
async function fetchIncludedEntities() {
|
async function fetchIncludedEntities() {
|
||||||
const include = {
|
const include = {
|
||||||
|
@ -191,6 +207,7 @@ async function fetchEntitiesBySlug(entitySlugs, sort = 'asc') {
|
||||||
array['parent'] as parent_path
|
array['parent'] as parent_path
|
||||||
FROM entities
|
FROM entities
|
||||||
WHERE slug = ANY(:entitySlugs)
|
WHERE slug = ANY(:entitySlugs)
|
||||||
|
OR url ILIKE ANY(:entityHosts)
|
||||||
|
|
||||||
UNION ALL
|
UNION ALL
|
||||||
|
|
||||||
|
@ -215,14 +232,23 @@ async function fetchEntitiesBySlug(entitySlugs, sort = 'asc') {
|
||||||
WHERE entity_tree.parent_id IS NULL
|
WHERE entity_tree.parent_id IS NULL
|
||||||
GROUP BY entity_tree.entity
|
GROUP BY entity_tree.entity
|
||||||
ORDER BY entity->'type' :sort;
|
ORDER BY entity->'type' :sort;
|
||||||
`, { entitySlugs, sort: knex.raw(sort) });
|
`, {
|
||||||
|
entitySlugs: entitySlugs.filter((slug) => !slug.includes('.')),
|
||||||
|
entityHosts: entitySlugs.filter((slug) => slug.includes('.')).map((hostname) => `%${hostname}%`),
|
||||||
|
sort: knex.raw(sort),
|
||||||
|
});
|
||||||
|
|
||||||
// channel entity will overwrite network entity
|
// channel entity will overwrite network entity
|
||||||
const entitiesBySlug = entities.rows.reduce((accEntities, { entity }) => ({
|
const entitiesBySlug = entities.rows.reduce((accEntities, { entity }) => {
|
||||||
|
const host = urlToHostname(entity.url);
|
||||||
|
const curatedEntity = accEntities[entity.slug] || accEntities[host] || curateEntity(entity, true);
|
||||||
|
|
||||||
|
return {
|
||||||
...accEntities,
|
...accEntities,
|
||||||
[entity.slug]: accEntities[entity.slug] || curateEntity(entity, true),
|
[entity.slug]: curatedEntity,
|
||||||
[urlToSiteSlug(entity.url)]: accEntities[urlToSiteSlug(entity.url)] || curateEntity(entity, true),
|
[host]: curatedEntity,
|
||||||
}), {});
|
};
|
||||||
|
}, {});
|
||||||
|
|
||||||
return entitiesBySlug;
|
return entitiesBySlug;
|
||||||
}
|
}
|
||||||
|
@ -232,7 +258,7 @@ async function fetchReleaseEntities(baseReleases) {
|
||||||
|
|
||||||
const entitySlugs = Array.from(new Set(
|
const entitySlugs = Array.from(new Set(
|
||||||
baseReleasesWithoutEntity
|
baseReleasesWithoutEntity
|
||||||
.map((baseRelease) => urlToSiteSlug(baseRelease.url))
|
.map((baseRelease) => urlToHostname(baseRelease.url))
|
||||||
.filter(Boolean),
|
.filter(Boolean),
|
||||||
));
|
));
|
||||||
|
|
||||||
|
@ -409,5 +435,6 @@ module.exports = {
|
||||||
getRecursiveParent,
|
getRecursiveParent,
|
||||||
searchEntities,
|
searchEntities,
|
||||||
flushEntities,
|
flushEntities,
|
||||||
urlToSiteSlug,
|
urlToHostname,
|
||||||
|
// urlToSiteSlug,
|
||||||
};
|
};
|
||||||
|
|
|
@ -22,6 +22,8 @@ function scrapeBlockLatest(scenes) {
|
||||||
|
|
||||||
release.teaser = qu.video();
|
release.teaser = qu.video();
|
||||||
|
|
||||||
|
console.log(release);
|
||||||
|
|
||||||
return release;
|
return release;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -57,6 +59,15 @@ function scrapeClassicLatest(scenes) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Scrape a single scene page into a release object.
 *
 * Only the title is extracted, read from the `.indScene h2` element.
 *
 * @param {object} context
 * @param {object} context.query - Query helper bound to the scene page; must
 *   expose `content(selector)`. NOTE(review): presumably the unprint query
 *   object given `useUnprint: true` in the exports; confirm against caller.
 * @returns {{ title: * }} Release with the scraped title.
 */
function scrapeScene({ query }) {
  // Debug printing of the release was removed; scrapers should not write
  // every scraped item to stdout.
  return {
    title: query.content('.indScene h2'),
  };
}
|
||||||
|
|
||||||
async function fetchLatest(site, page = 1) {
|
async function fetchLatest(site, page = 1) {
|
||||||
if (!site.parameters) {
|
if (!site.parameters) {
|
||||||
return null;
|
return null;
|
||||||
|
@ -74,4 +85,6 @@ async function fetchLatest(site, page = 1) {
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
fetchLatest,
|
fetchLatest,
|
||||||
|
scrapeScene,
|
||||||
|
useUnprint: true,
|
||||||
};
|
};
|
||||||
|
|
|
@ -481,10 +481,14 @@ async function addReleaseMedia(medias, release, target) {
|
||||||
await transferMedia(media, target);
|
await transferMedia(media, target);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
await knex(`${release.type}s_${target}`).insert({
|
await knex(`${release.type}s_${target}`).insert({
|
||||||
[`${release.type}_id`]: release.id,
|
[`${release.type}_id`]: release.id,
|
||||||
media_id: id,
|
media_id: id,
|
||||||
});
|
});
|
||||||
|
} catch (error) {
|
||||||
|
console.warn(`Ignored duplicate ${release.type} ${target} association ${media.hash} with ${release.id} "${release.title}"`);
|
||||||
|
}
|
||||||
}, Promise.resolve());
|
}, Promise.resolve());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -496,8 +500,6 @@ async function linkMovieScenes(release, context) {
|
||||||
&& storedMovie.entity.slug === linkedMovie.entity.slug
|
&& storedMovie.entity.slug === linkedMovie.entity.slug
|
||||||
&& storedMovie.entity.type === linkedMovie.entity.type);
|
&& storedMovie.entity.type === linkedMovie.entity.type);
|
||||||
|
|
||||||
console.log('movie', linkedMovie, movie);
|
|
||||||
|
|
||||||
if (!movie) {
|
if (!movie) {
|
||||||
throw new Error(`Missing ${linkedMovie.entity.slug} movie '${linkedMovie.title}' in '${release.title}'`);
|
throw new Error(`Missing ${linkedMovie.entity.slug} movie '${linkedMovie.title}' in '${release.title}'`);
|
||||||
}
|
}
|
||||||
|
@ -534,7 +536,7 @@ async function addRelease(release, context) {
|
||||||
]);
|
]);
|
||||||
|
|
||||||
if (!entity) {
|
if (!entity) {
|
||||||
throw new Error(`Release contains non-existent ${release.entity.type} '${release.entity.slug}'`);
|
throw new Error(`Release "${release.title}" contains non-existent ${release.entity.type} '${release.entity.slug}'`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const [releaseEntry] = await knex(`${release.type}s`)
|
const [releaseEntry] = await knex(`${release.type}s`)
|
||||||
|
|