Matching URLs to entity using hostname rather than slug to minimize collisions. Fixed missing Cum Louder POV logo.
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
Before Width: | Height: | Size: 3.9 KiB After Width: | Height: | Size: 3.9 KiB |
Before Width: | Height: | Size: 2.7 KiB After Width: | Height: | Size: 2.8 KiB |
Before Width: | Height: | Size: 2.4 KiB After Width: | Height: | Size: 2.5 KiB |
Before Width: | Height: | Size: 7.2 KiB After Width: | Height: | Size: 7.2 KiB |
Before Width: | Height: | Size: 6.8 KiB After Width: | Height: | Size: 6.8 KiB |
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.3 KiB |
After Width: | Height: | Size: 3.2 KiB |
Before Width: | Height: | Size: 3.0 KiB After Width: | Height: | Size: 3.1 KiB |
Before Width: | Height: | Size: 2.9 KiB After Width: | Height: | Size: 2.9 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 7.2 KiB After Width: | Height: | Size: 7.2 KiB |
Before Width: | Height: | Size: 3.9 KiB After Width: | Height: | Size: 3.9 KiB |
Before Width: | Height: | Size: 4.8 KiB After Width: | Height: | Size: 4.8 KiB |
Before Width: | Height: | Size: 4.0 KiB After Width: | Height: | Size: 4.1 KiB |
Before Width: | Height: | Size: 5.6 KiB After Width: | Height: | Size: 5.7 KiB |
Before Width: | Height: | Size: 6.9 KiB After Width: | Height: | Size: 6.9 KiB |
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.3 KiB |
Before Width: | Height: | Size: 5.5 KiB After Width: | Height: | Size: 5.5 KiB |
Before Width: | Height: | Size: 4.4 KiB After Width: | Height: | Size: 4.5 KiB |
Before Width: | Height: | Size: 5.3 KiB After Width: | Height: | Size: 5.4 KiB |
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.2 KiB |
Before Width: | Height: | Size: 3.4 KiB After Width: | Height: | Size: 3.4 KiB |
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 19 KiB |
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 16 KiB |
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
Before Width: | Height: | Size: 27 KiB After Width: | Height: | Size: 27 KiB |
Before Width: | Height: | Size: 8.9 KiB After Width: | Height: | Size: 9.0 KiB |
After Width: | Height: | Size: 12 KiB |
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 19 KiB |
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 9.0 KiB After Width: | Height: | Size: 9.0 KiB |
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 19 KiB |
Before Width: | Height: | Size: 13 KiB After Width: | Height: | Size: 13 KiB |
Before Width: | Height: | Size: 17 KiB After Width: | Height: | Size: 18 KiB |
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
Before Width: | Height: | Size: 8.9 KiB After Width: | Height: | Size: 9.0 KiB |
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 18 KiB |
Before Width: | Height: | Size: 9.9 KiB After Width: | Height: | Size: 10 KiB |
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 16 KiB |
Before Width: | Height: | Size: 10 KiB After Width: | Height: | Size: 10 KiB |
Before Width: | Height: | Size: 17 KiB After Width: | Height: | Size: 17 KiB |
19
src/deep.js
|
@ -7,7 +7,7 @@ const { mergeAdvanced: merge } = require('object-merge-advanced');
|
|||
|
||||
const argv = require('./argv');
|
||||
const include = require('./utils/argv-include')(argv);
|
||||
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
|
||||
const { fetchReleaseEntities, urlToHostname } = require('./entities');
|
||||
const logger = require('./logger')(__filename);
|
||||
const qu = require('./utils/qu');
|
||||
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
||||
|
@ -110,8 +110,11 @@ function fetchMovie(scraper, url, entity, baseRelease, options) {
|
|||
return fetchScene(scraper, url, entity, baseRelease, options, 'movie');
|
||||
}
|
||||
|
||||
async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
||||
const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];
|
||||
async function scrapeRelease(baseRelease, entitiesByHostname, type = 'scene') {
|
||||
const entity = baseRelease.entity || entitiesByHostname[urlToHostname(baseRelease.url)];
|
||||
|
||||
console.log(entitiesByHostname);
|
||||
console.log(entity);
|
||||
|
||||
if (!entity) {
|
||||
logger.warn(`No entity available for ${baseRelease.url}`);
|
||||
|
@ -222,10 +225,8 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
|||
}
|
||||
}
|
||||
|
||||
async function scrapeReleases(baseReleases, entitiesBySlug, type) {
|
||||
const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesBySlug).map(async ([slug, entity]) => {
|
||||
console.log('scraper', entity.scraper?.beforeFetchScenes);
|
||||
|
||||
async function scrapeReleases(baseReleases, entitiesByHostname, type) {
|
||||
const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesByHostname).map(async ([slug, entity]) => {
|
||||
if (entity.scraper?.beforeFetchScenes) {
|
||||
const parameters = getRecursiveParameters(entity);
|
||||
const preData = await entity.scraper.beforeFetchScenes(entity, parameters);
|
||||
|
@ -249,9 +250,9 @@ async function scrapeReleases(baseReleases, entitiesBySlug, type) {
|
|||
|
||||
/**
 * Deep-scrape a list of base releases (or bare URLs).
 *
 * The input is normalised with `toBaseReleases`, the matching entities are
 * looked up with `fetchReleaseEntities`, and both are handed to
 * `scrapeReleases`. Falsy results are dropped from the returned list.
 *
 * @param {Array} baseReleasesOrUrls - Base releases and/or URLs, as accepted by `toBaseReleases`.
 * @param {string} [type='scene'] - Release type forwarded to `scrapeReleases`.
 * @returns {Promise<Array>} The scraped releases, without falsy entries.
 */
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
	const baseReleases = toBaseReleases(baseReleasesOrUrls);
	// Entities are looked up by hostname rather than by slug to minimise collisions.
	const entitiesByHostname = await fetchReleaseEntities(baseReleases);

	const deepReleases = await scrapeReleases(baseReleases, entitiesByHostname, type);

	return deepReleases.filter(Boolean);
}
|
||||
|
|
|
@ -82,6 +82,7 @@ async function curateEntities(entities, includeParameters) {
|
|||
return Promise.all(entities.map(async (entity) => curateEntity(entity, includeParameters)));
|
||||
}
|
||||
|
||||
/* obsolete in favor of urlToHostname
|
||||
function urlToSiteSlug(url) {
|
||||
try {
|
||||
const slug = new URL(url)
|
||||
|
@ -96,6 +97,21 @@ function urlToSiteSlug(url) {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/**
 * Derive the hostname used to match a URL to an entity.
 *
 * A leading `www.` is stripped so that `https://www.example.com/...` and
 * `https://example.com/...` both resolve to `example.com`. Other subdomains
 * are kept as they are.
 *
 * @param {string} url - Absolute URL to take the hostname from.
 * @returns {string|null} The hostname without a leading `www.` (may be an
 *   empty string for URLs that have no host), or `null` when `url` cannot be
 *   parsed; in that case a warning is logged.
 */
function urlToHostname(url) {
	try {
		const { hostname } = new URL(url);

		// The prefix is optional and only stripped at the start; a hostname
		// without `www.` must still produce a usable key.
		return hostname.replace(/^www\./, '');
	} catch (error) {
		logger.warn(`Failed to derive entity hostname from '${url}': ${error.message}`);

		return null;
	}
}
|
||||
|
||||
async function fetchIncludedEntities() {
|
||||
const include = {
|
||||
|
@ -191,6 +207,7 @@ async function fetchEntitiesBySlug(entitySlugs, sort = 'asc') {
|
|||
array['parent'] as parent_path
|
||||
FROM entities
|
||||
WHERE slug = ANY(:entitySlugs)
|
||||
OR url ILIKE ANY(:entityHosts)
|
||||
|
||||
UNION ALL
|
||||
|
||||
|
@ -215,14 +232,23 @@ async function fetchEntitiesBySlug(entitySlugs, sort = 'asc') {
|
|||
WHERE entity_tree.parent_id IS NULL
|
||||
GROUP BY entity_tree.entity
|
||||
ORDER BY entity->'type' :sort;
|
||||
`, { entitySlugs, sort: knex.raw(sort) });
|
||||
`, {
|
||||
entitySlugs: entitySlugs.filter((slug) => !slug.includes('.')),
|
||||
entityHosts: entitySlugs.filter((slug) => slug.includes('.')).map((hostname) => `%${hostname}%`),
|
||||
sort: knex.raw(sort),
|
||||
});
|
||||
|
||||
// channel entity will overwrite network entity
|
||||
const entitiesBySlug = entities.rows.reduce((accEntities, { entity }) => ({
|
||||
...accEntities,
|
||||
[entity.slug]: accEntities[entity.slug] || curateEntity(entity, true),
|
||||
[urlToSiteSlug(entity.url)]: accEntities[urlToSiteSlug(entity.url)] || curateEntity(entity, true),
|
||||
}), {});
|
||||
const entitiesBySlug = entities.rows.reduce((accEntities, { entity }) => {
|
||||
const host = urlToHostname(entity.url);
|
||||
const curatedEntity = accEntities[entity.slug] || accEntities[host] || curateEntity(entity, true);
|
||||
|
||||
return {
|
||||
...accEntities,
|
||||
[entity.slug]: curatedEntity,
|
||||
[host]: curatedEntity,
|
||||
};
|
||||
}, {});
|
||||
|
||||
return entitiesBySlug;
|
||||
}
|
||||
|
@ -232,7 +258,7 @@ async function fetchReleaseEntities(baseReleases) {
|
|||
|
||||
const entitySlugs = Array.from(new Set(
|
||||
baseReleasesWithoutEntity
|
||||
.map((baseRelease) => urlToSiteSlug(baseRelease.url))
|
||||
.map((baseRelease) => urlToHostname(baseRelease.url))
|
||||
.filter(Boolean),
|
||||
));
|
||||
|
||||
|
@ -409,5 +435,6 @@ module.exports = {
|
|||
getRecursiveParent,
|
||||
searchEntities,
|
||||
flushEntities,
|
||||
urlToSiteSlug,
|
||||
urlToHostname,
|
||||
// urlToSiteSlug,
|
||||
};
|
||||
|
|
|
@ -22,6 +22,8 @@ function scrapeBlockLatest(scenes) {
|
|||
|
||||
release.teaser = qu.video();
|
||||
|
||||
console.log(release);
|
||||
|
||||
return release;
|
||||
});
|
||||
}
|
||||
|
@ -57,6 +59,15 @@ function scrapeClassicLatest(scenes) {
|
|||
});
|
||||
}
|
||||
|
||||
/**
 * Scrape a single scene page into a release object.
 *
 * @param {object} context - Scrape context.
 * @param {object} context.query - Query helper exposing `content(selector)`;
 *   presumably the project's page query wrapper (confirm against the caller).
 * @returns {{ title: * }} Release with `title` set to whatever
 *   `query.content('.indScene h2')` returns.
 */
function scrapeScene({ query }) {
	const release = {};

	release.title = query.content('.indScene h2');

	// No debug logging here: this runs once per scraped scene.
	return release;
}
|
||||
|
||||
async function fetchLatest(site, page = 1) {
|
||||
if (!site.parameters) {
|
||||
return null;
|
||||
|
@ -74,4 +85,6 @@ async function fetchLatest(site, page = 1) {
|
|||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
scrapeScene,
|
||||
useUnprint: true,
|
||||
};
|
||||
|
|
|
@ -481,10 +481,14 @@ async function addReleaseMedia(medias, release, target) {
|
|||
await transferMedia(media, target);
|
||||
}
|
||||
|
||||
await knex(`${release.type}s_${target}`).insert({
|
||||
[`${release.type}_id`]: release.id,
|
||||
media_id: id,
|
||||
});
|
||||
try {
|
||||
await knex(`${release.type}s_${target}`).insert({
|
||||
[`${release.type}_id`]: release.id,
|
||||
media_id: id,
|
||||
});
|
||||
} catch (error) {
|
||||
console.warn(`Ignored duplicate ${release.type} ${target} association ${media.hash} with ${release.id} "${release.title}"`);
|
||||
}
|
||||
}, Promise.resolve());
|
||||
}
|
||||
|
||||
|
@ -496,8 +500,6 @@ async function linkMovieScenes(release, context) {
|
|||
&& storedMovie.entity.slug === linkedMovie.entity.slug
|
||||
&& storedMovie.entity.type === linkedMovie.entity.type);
|
||||
|
||||
console.log('movie', linkedMovie, movie);
|
||||
|
||||
if (!movie) {
|
||||
throw new Error(`Missing ${linkedMovie.entity.slug} movie '${linkedMovie.title}' in '${release.title}'`);
|
||||
}
|
||||
|
@ -534,7 +536,7 @@ async function addRelease(release, context) {
|
|||
]);
|
||||
|
||||
if (!entity) {
|
||||
throw new Error(`Release contains non-existent ${release.entity.type} '${release.entity.slug}'`);
|
||||
throw new Error(`Release "${release.title}" contains non-existent ${release.entity.type} '${release.entity.slug}'`);
|
||||
}
|
||||
|
||||
const [releaseEntry] = await knex(`${release.type}s`)
|
||||
|
|