Improved update runner. Improved HTTP module API, added default user agent. Added PornCZ and Czechav logos.
This commit is contained in:
15
src/app.js
15
src/app.js
@@ -6,7 +6,7 @@ const initServer = require('./web/server');
|
||||
|
||||
const knex = require('./knex');
|
||||
const fetchUpdates = require('./updates');
|
||||
const fetchDeep = require('./deep');
|
||||
const { fetchScenes, fetchMovies } = require('./deep');
|
||||
const { storeReleases } = require('./store-releases');
|
||||
const { updateReleasesSearch } = require('./releases');
|
||||
// const { storeReleaseActors } = require('./actors');
|
||||
@@ -23,12 +23,17 @@ async function init() {
|
||||
return;
|
||||
}
|
||||
|
||||
const updateBaseReleases = (argv.scrape || argv.sites || argv.networks) && await fetchUpdates();
|
||||
const updateBaseScenes = (argv.scrape || argv.sites || argv.networks) && await fetchUpdates();
|
||||
const deepScenes = await fetchScenes([...(argv.scenes || []), ...(updateBaseScenes || [])]);
|
||||
|
||||
const updateDeepReleases = updateBaseReleases && await fetchDeep(updateBaseReleases);
|
||||
const argvDeepReleases = argv.scenes && await fetchDeep(argv.scenes);
|
||||
console.log(deepScenes.map(scene => scene.movie));
|
||||
|
||||
await storeReleases([...(updateDeepReleases || []), ...(argvDeepReleases || [])]);
|
||||
const argvDeepMovies = argv.movies && await fetchMovies(argv.movies);
|
||||
|
||||
await storeReleases([
|
||||
...(deepScenes || []),
|
||||
...(argvDeepMovies || []),
|
||||
]);
|
||||
|
||||
// await storeReleaseActors(updateReleases);
|
||||
|
||||
|
||||
35
src/deep.js
35
src/deep.js
@@ -3,6 +3,7 @@
|
||||
const Promise = require('bluebird');
|
||||
|
||||
const argv = require('./argv');
|
||||
const include = require('./utils/argv-include')(argv);
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const scrapers = require('./scrapers/scrapers');
|
||||
@@ -32,7 +33,11 @@ async function findSites(baseReleases) {
|
||||
.filter(Boolean),
|
||||
));
|
||||
|
||||
const siteEntries = await knex('sites').whereIn('slug', siteSlugs);
|
||||
const siteEntries = await knex('sites')
|
||||
.leftJoin('networks', 'networks.id', 'sites.network_id')
|
||||
.select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description')
|
||||
.whereIn('sites.slug', siteSlugs);
|
||||
|
||||
const networkEntries = await knex('networks').whereIn('slug', siteSlugs);
|
||||
|
||||
const sites = await curateSites(siteEntries, true, false);
|
||||
@@ -40,7 +45,7 @@ async function findSites(baseReleases) {
|
||||
const markedNetworks = networks.map(network => ({ ...network, isFallback: true }));
|
||||
|
||||
const sitesBySlug = []
|
||||
.concat(sites, markedNetworks)
|
||||
.concat(markedNetworks, sites)
|
||||
.reduce((accSites, site) => ({ ...accSites, [site.slug]: site }), {});
|
||||
|
||||
return sitesBySlug;
|
||||
@@ -108,8 +113,8 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
|
||||
|
||||
try {
|
||||
const scrapedRelease = type === 'scene'
|
||||
? await scraper.fetchScene(baseRelease.url, site, baseRelease)
|
||||
: await scraper.fetchMovie(baseRelease.url, site, baseRelease);
|
||||
? await scraper.fetchScene(baseRelease.url, site, baseRelease, null, include)
|
||||
: await scraper.fetchMovie(baseRelease.url, site, baseRelease, null, include);
|
||||
|
||||
const mergedRelease = {
|
||||
...baseRelease,
|
||||
@@ -129,21 +134,33 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Deep-scrapes a list of base releases, at most 10 at a time.
 *
 * @param {object[]} baseReleases - base releases to scrape
 * @param {object} sites - site lookup handed to scrapeRelease unchanged
 * @param {string} [type] - release type forwarded to scrapeRelease; when
 *   omitted, scrapeRelease applies its own default
 * @returns {Promise<Array>} one scrapeRelease result per base release
 */
async function scrapeReleases(baseReleases, sites, type) {
  return Promise.map(
    baseReleases,
    async baseRelease => scrapeRelease(baseRelease, sites, type),
    { concurrency: 10 },
  );
}
|
||||
|
||||
/**
 * Normalizes the input into base releases, looks up their sites and
 * deep-scrapes each of them.
 *
 * @param {Array} baseReleasesOrUrls - base releases or URLs, normalized by toBaseReleases
 * @param {string} [type='scene'] - release type passed on to scrapeReleases
 * @returns {Promise<Array>} the deep-scraped releases
 */
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
  const baseReleases = toBaseReleases(baseReleasesOrUrls);
  const sites = await findSites(baseReleases);

  const deepReleases = await scrapeReleases(baseReleases, sites, type);

  return deepReleases;
}
|
||||
|
||||
module.exports = fetchReleases;
|
||||
/**
 * Deep-fetches the given base releases or URLs as scenes.
 *
 * @param {Array} baseReleasesOrUrls - passed to fetchReleases unchanged
 * @returns {Promise<Array>} result of fetchReleases with type 'scene'
 */
async function fetchScenes(baseReleasesOrUrls) {
  const scenes = await fetchReleases(baseReleasesOrUrls, 'scene');

  return scenes;
}
|
||||
|
||||
/**
 * Deep-fetches the given base releases or URLs as movies.
 *
 * @param {Array} baseReleasesOrUrls - passed to fetchReleases unchanged
 * @returns {Promise<Array>} result of fetchReleases with type 'movie'
 */
async function fetchMovies(baseReleasesOrUrls) {
  const movies = await fetchReleases(baseReleasesOrUrls, 'movie');

  return movies;
}
|
||||
|
||||
module.exports = {
|
||||
fetchReleases,
|
||||
fetchScenes,
|
||||
fetchMovies,
|
||||
};
|
||||
|
||||
507
src/releases-legacy.js
Normal file
507
src/releases-legacy.js
Normal file
@@ -0,0 +1,507 @@
|
||||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
const moment = require('moment');
|
||||
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const argv = require('./argv');
|
||||
const whereOr = require('./utils/where-or');
|
||||
const { associateTags } = require('./tags');
|
||||
const { associateActors, scrapeBasicActors } = require('./actors');
|
||||
const {
|
||||
pluckItems,
|
||||
storeMedia,
|
||||
associateMedia,
|
||||
} = require('./media');
|
||||
const { fetchSites } = require('./sites');
|
||||
const slugify = require('./utils/slugify');
|
||||
const capitalize = require('./utils/capitalize');
|
||||
|
||||
/**
 * Extends a query builder with the joins, column selection, tag exclusion,
 * date window, ordering and row limit shared by the release fetchers.
 *
 * The builder is extended in place and nothing is returned, which matches
 * how the fetchers pass this function to `.modify()`.
 *
 * @param {object} queryBuilder - query builder to extend
 * @param {object} [options]
 * @param {string|string[]} [options.filter=[]] - tag slugs; releases associated with any of them are excluded
 * @param {Date} [options.after] - exclusive lower bound for releases.date
 * @param {Date} [options.before] - inclusive upper bound for releases.date
 * @param {number} [options.limit=100] - maximum number of rows
 */
function commonQuery(queryBuilder, {
  filter = [],
  after = new Date(0), // January 1970
  before = new Date(2 ** 44), // 2 ** 44 ms after the epoch, i.e. year 2527
  limit = 100,
} = {}) {
  const finalFilter = [].concat(filter); // ensure filter is array

  queryBuilder
    .leftJoin('sites', 'releases.site_id', 'sites.id')
    .leftJoin('studios', 'releases.studio_id', 'studios.id')
    .leftJoin('networks', 'sites.network_id', 'networks.id')
    .select(
      'releases.*',
      'sites.name as site_name', 'sites.slug as site_slug', 'sites.url as site_url', 'sites.network_id', 'sites.parameters as site_parameters',
      // studio_slug is read when curating a release; it used to be a second copy of site_slug
      'studios.name as studio_name', 'studios.slug as studio_slug', 'studios.url as studio_url',
      'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description',
    )
    .whereNotExists((builder) => {
      // apply tag filters: drop releases that have any of the filtered tags
      builder
        .select('*')
        .from('tags_associated')
        .leftJoin('tags', 'tags_associated.tag_id', 'tags.id')
        .whereIn('tags.slug', finalFilter)
        .where('tags_associated.domain', 'releases')
        .whereRaw('tags_associated.target_id = releases.id');
    })
    .andWhere('releases.date', '>', after)
    .andWhere('releases.date', '<=', before)
    .orderBy([{ column: 'date', order: 'desc' }, { column: 'created_at', order: 'desc' }])
    .limit(limit);
}
|
||||
|
||||
/**
 * Turns a joined release row into the nested release object handed to callers.
 *
 * Loads the release's actors (with birth country and first avatar thumbnail),
 * tags and media, then maps the flat snake_case columns onto camelCase fields.
 *
 * @param {object} release - release row including the site_*, studio_* and network_* columns
 * @returns {Promise<object>} curated release
 */
async function curateRelease(release) {
  const actorsQuery = knex('actors_associated')
    .select(
      'actors.id', 'actors.name', 'actors.gender', 'actors.slug', 'actors.birthdate',
      'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name', 'birth_countries.alias as birth_country_alias',
      'media.thumbnail as avatar',
    )
    .where({ release_id: release.id })
    .leftJoin('actors', 'actors.id', 'actors_associated.actor_id')
    .leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
    .leftJoin('media', (builder) => {
      // only the actor media entry with index 0 is joined in as avatar
      builder
        .on('media.target_id', 'actors.id')
        .andOnVal('media.domain', 'actors')
        .andOnVal('media.index', '0');
    })
    .orderBy('actors.gender');

  const tagsQuery = knex('tags_associated')
    .select('tags.name', 'tags.slug')
    .where({
      domain: 'releases',
      target_id: release.id,
    })
    .leftJoin('tags', 'tags.id', 'tags_associated.tag_id')
    .orderBy('tags.priority', 'desc');

  const mediaQuery = knex('media')
    .where({
      target_id: release.id,
      domain: 'releases',
    })
    .orderBy(['role', 'index']);

  const [actors, tags, media] = await Promise.all([actorsQuery, tagsQuery, mediaQuery]);

  const mediaWithRole = role => media.filter(item => item.role === role);

  const curatedActors = actors.map((actor) => {
    // the country is exposed under its alias, not its full name
    const origin = actor.birth_country_alpha2
      ? {
        country: {
          name: actor.birth_country_alias,
          alpha2: actor.birth_country_alpha2,
        },
      }
      : null;

    return {
      id: actor.id,
      slug: actor.slug,
      name: actor.name,
      gender: actor.gender,
      birthdate: actor.birthdate,
      age: moment().diff(actor.birthdate, 'years'), // age today
      ageThen: moment(release.date).diff(actor.birthdate, 'years'), // age at the release date
      avatar: actor.avatar,
      origin,
    };
  });

  const studio = release.studio_id
    ? {
      id: release.studio_id,
      name: release.studio_name,
      slug: release.studio_slug,
      url: release.studio_url,
    }
    : null;

  return {
    id: release.id,
    type: release.type,
    title: release.title,
    date: release.date,
    dateAdded: release.created_at,
    description: release.description,
    url: release.url,
    shootId: release.shoot_id,
    entryId: release.entry_id,
    actors: curatedActors,
    director: release.director,
    tags,
    duration: release.duration,
    photos: mediaWithRole('photo'),
    poster: mediaWithRole('poster')[0],
    covers: mediaWithRole('cover'),
    trailer: mediaWithRole('trailer')[0],
    site: {
      id: release.site_id,
      name: release.site_name,
      independent: !!release.site_parameters?.independent,
      slug: release.site_slug,
      url: release.site_url,
    },
    studio,
    network: {
      id: release.network_id,
      name: release.network_name,
      description: release.network_description,
      slug: release.network_slug,
      url: release.network_url,
    },
  };
}
|
||||
|
||||
/**
 * Curates every release row in the list.
 *
 * @param {object[]} releases - release rows
 * @returns {Promise<object[]>} curated releases, in input order
 */
function curateReleases(releases) {
  const pendingReleases = releases.map(release => curateRelease(release));

  return Promise.all(pendingReleases);
}
|
||||
|
||||
/**
 * Replaces a release's site with the site matching its channel.
 *
 * Only applies when the current site is flagged as a fallback or the channel
 * is flagged `force`; any other release is returned untouched.
 *
 * @param {object} release - scraped release; `channel` may be a string or an object with name/slug
 * @returns {Promise<object>} the release itself, or a copy with `site` replaced
 * @throws {Error} when a channel is required but missing, or no site matches it
 */
async function attachChannelSite(release) {
  const needsChannelSite = release.site?.isFallback || release.channel?.force;

  if (!needsChannelSite) {
    return release;
  }

  const { channel } = release;

  if (!channel) {
    throw new Error(`Unable to derive channel site from generic URL: ${release.url}`);
  }

  // a channel given as a plain string serves as both name and slug
  const [site] = await fetchSites({
    name: channel.name || channel,
    slug: channel.slug || channel,
  });

  if (!site) {
    throw new Error(`Unable to match channel '${channel.slug || channel}' from generic URL: ${release.url}`);
  }

  return { ...release, site };
}
|
||||
|
||||
/**
 * Swaps a release's studio reference for the first studio row whose name,
 * slug or URL equals it.
 *
 * @param {object} release - release whose `studio` is a name, slug or URL
 * @returns {Promise<object>} the release itself when it has no studio,
 *   otherwise a copy whose `studio` is the result of the lookup
 */
async function attachStudio(release) {
  const { studio: studioReference } = release;

  if (!studioReference) {
    return release;
  }

  const studio = await knex('studios')
    .where('name', studioReference)
    .orWhere('slug', studioReference)
    .orWhere('url', studioReference)
    .first();

  return { ...release, studio };
}
|
||||
|
||||
/**
 * Builds the row written to the releases table for a scraped release.
 *
 * @param {object} release - scraped release with `site` (and optionally `studio`) attached
 * @param {*} batchId - id of the current batch, stored as updated_batch_id
 * @param {object} [existingRelease] - existing row, if any; created_batch_id is only set without one
 * @returns {Promise<object>} column/value map for the releases table
 */
async function curateReleaseEntry(release, batchId, existingRelease) {
  const titleSlug = slugify(release.title, {
    encode: true,
    limit: config.titleSlugLength,
  });

  const entry = {
    site_id: release.site.id,
    studio_id: release.studio ? release.studio.id : null,
    shoot_id: release.shootId || null,
    entry_id: release.entryId || null,
    type: release.type,
    url: release.url,
    title: release.title,
    slug: titleSlug,
    date: release.date,
    description: release.description,
    duration: release.duration,
    // anything but an explicit boolean counts as not deep
    deep: typeof release.deep === 'boolean' ? release.deep : false,
    deep_url: release.deepUrl,
    updated_batch_id: batchId,
  };

  if (!existingRelease) {
    entry.created_batch_id = batchId;
  }

  return entry;
}
|
||||
|
||||
/**
 * Fetches curated releases matching conditions on the releases table.
 *
 * @param {object} [queryObject={}] - conditions handed to whereOr for 'releases'
 * @param {object} [options={}] - filter/after/before/limit options for commonQuery
 * @returns {Promise<object[]>} curated releases
 */
async function fetchReleases(queryObject = {}, options = {}) {
  const matchReleases = builder => whereOr(queryObject, 'releases', builder);

  const rows = await knex('releases')
    .modify(commonQuery, options)
    .andWhere(matchReleases);

  return curateReleases(rows);
}
|
||||
|
||||
/**
 * Fetches curated releases matching conditions on the joined sites table.
 *
 * @param {object} queryObject - conditions handed to whereOr for 'sites'
 * @param {object} [options={}] - filter/after/before/limit options for commonQuery
 * @returns {Promise<object[]>} curated releases
 */
async function fetchSiteReleases(queryObject, options = {}) {
  const matchSites = builder => whereOr(queryObject, 'sites', builder);

  const rows = await knex('releases')
    .modify(commonQuery, options)
    .where(matchSites);

  return curateReleases(rows);
}
|
||||
|
||||
/**
 * Fetches curated releases matching conditions on the joined networks table.
 *
 * @param {object} queryObject - conditions handed to whereOr for 'networks'
 * @param {object} [options={}] - filter/after/before/limit options for commonQuery
 * @returns {Promise<object[]>} curated releases
 */
async function fetchNetworkReleases(queryObject, options = {}) {
  const matchNetworks = builder => whereOr(queryObject, 'networks', builder);

  const rows = await knex('releases')
    .modify(commonQuery, options)
    .where(matchNetworks);

  return curateReleases(rows);
}
|
||||
|
||||
/**
 * Fetches curated releases through the actor association table, matching
 * conditions on the joined actors table.
 *
 * @param {object} queryObject - conditions handed to whereOr for 'actors'
 * @param {object} [options={}] - filter/after/before/limit options for commonQuery
 * @returns {Promise<object[]>} curated releases
 */
async function fetchActorReleases(queryObject, options = {}) {
  const matchActors = builder => whereOr(queryObject, 'actors', builder);

  const rows = await knex('actors_associated')
    .leftJoin('releases', 'actors_associated.release_id', 'releases.id')
    .leftJoin('actors', 'actors_associated.actor_id', 'actors.id')
    .select('actors.name as actor_name')
    .modify(commonQuery, options)
    .where(matchActors);

  return curateReleases(rows);
}
|
||||
|
||||
/**
 * Fetches curated releases through the tag association table, matching
 * conditions on the joined tags table. Only associations in the 'releases'
 * domain are considered.
 *
 * @param {object} queryObject - conditions handed to whereOr for 'tags'
 * @param {object} [options={}] - filter/after/before/limit options for commonQuery
 * @returns {Promise<object[]>} curated releases
 */
async function fetchTagReleases(queryObject, options = {}) {
  const matchTags = builder => whereOr(queryObject, 'tags', builder);

  const rows = await knex('tags_associated')
    .leftJoin('releases', 'tags_associated.target_id', 'releases.id')
    .leftJoin('tags', 'tags_associated.tag_id', 'tags.id')
    .select('tags.name as tag_name')
    .modify(commonQuery, options)
    .where('tags_associated.domain', 'releases')
    .where(matchTags);

  return curateReleases(rows);
}
|
||||
|
||||
/**
 * Collects the actors of all releases into one object keyed by actor slug.
 *
 * Each entry holds the trimmed name, the slug, a Set of the ids of the
 * releases the actor appears in, and the avatars gathered along the way.
 * Actors given as objects also contribute their other properties.
 *
 * Entries that are neither a string nor an object with a string `name`
 * are skipped instead of throwing.
 *
 * @param {object[]} releases - stored releases; `actors` may hold strings or objects
 * @returns {object} accumulated actors by slug
 */
function accumulateActors(releases) {
  return releases.reduce((acc, release) => {
    if (!Array.isArray(release.actors)) return acc;

    release.actors.forEach((actor) => {
      const rawName = typeof actor === 'string' ? actor : actor?.name;

      if (typeof rawName !== 'string') return; // null, nameless or otherwise malformed entry

      const actorName = rawName.trim();

      if (!actorName) return;

      const actorSlug = slugify(actorName);

      if (!actorSlug) return;

      if (!acc[actorSlug]) {
        acc[actorSlug] = {
          name: actorName,
          slug: actorSlug,
          releaseIds: new Set(),
          avatars: [],
        };
      }

      acc[actorSlug].releaseIds.add(release.id);

      if (typeof actor === 'object') {
        // actor input contains profile info; merge it, but keep the trimmed
        // name and the accumulators from being overwritten by the input
        const { releaseIds, avatars } = acc[actorSlug];

        acc[actorSlug] = {
          ...acc[actorSlug],
          ...actor,
          name: actorName,
          releaseIds,
          avatars,
        };
      }

      if (actor.avatar) {
        const networkName = release.site?.network?.name;

        // the copyright falls back to the capitalized network name unless set explicitly
        const avatar = Array.isArray(actor.avatar)
          ? actor.avatar.map(avatarX => ({
            src: avatarX.src || avatarX,
            copyright: avatarX.copyright === undefined ? capitalize(networkName) : avatarX.copyright,
          }))
          : {
            src: actor.avatar.src || actor.avatar,
            copyright: actor.avatar.copyright === undefined ? capitalize(networkName) : actor.avatar.copyright,
          };

        acc[actorSlug].avatars = acc[actorSlug].avatars.concat([avatar]); // don't flatten fallbacks
      }
    });

    return acc;
  }, {});
}
|
||||
|
||||
/**
 * Stores and associates the posters, covers, photos, trailers and teasers of
 * the given releases, one asset kind after the other.
 *
 * Does nothing unless `argv.media` is set. Each asset kind is additionally
 * gated by its group flag (`argv.images` or `argv.videos`) and its own flag.
 *
 * @param {object[]} releases - stored releases carrying an `id` and their scraped media
 * @returns {Promise<void>}
 */
async function storeReleaseAssets(releases) {
  if (!argv.media) {
    return;
  }

  // map release id => media items, using the given picker on every release
  const collectById = pick => Object.fromEntries(releases.map(release => [release.id, pick(release)]));

  const postersById = collectById(release => [release.poster]);
  const coversById = collectById(release => release.covers);
  const trailersById = collectById(release => [release.trailer]);
  const teasersById = collectById(release => [release.teaser]);
  const photosById = collectById(release => pluckItems(release.photos));

  const assetKinds = [
    { group: 'images', flag: 'posters', role: 'poster', itemsById: postersById },
    { group: 'images', flag: 'covers', role: 'cover', itemsById: coversById },
    { group: 'images', flag: 'photos', role: 'photo', itemsById: photosById },
    { group: 'videos', flag: 'trailers', role: 'trailer', itemsById: trailersById },
    { group: 'videos', flag: 'teasers', role: 'teaser', itemsById: teasersById },
  ];

  // deliberately sequential: each kind is stored and associated before the next starts
  for (const { group, flag, role, itemsById } of assetKinds) {
    if (argv[group] && argv[flag]) {
      const storedItems = await storeMedia(Object.values(itemsById).flat(), 'release', role);

      if (storedItems) {
        await associateMedia(itemsById, storedItems, 'release', role);
      }
    }
  }
}
|
||||
|
||||
/**
 * Rebuilds the full-text search documents in releases_search.
 *
 * The document of a release combines its title, network and site details,
 * shoot id, several renderings of its date, and the names of its actors,
 * tags and tag aliases. Existing documents are overwritten via upsert.
 *
 * Every concatenated field is wrapped in COALESCE: in PostgreSQL `||` yields
 * NULL as soon as one operand is NULL, which would blank the whole document.
 *
 * @param {Array} [releaseIds] - ids of the releases to update; all releases
 *   when omitted. An empty array is a no-op.
 * @returns {Promise<void>}
 */
async function updateReleasesSearch(releaseIds) {
  if (Array.isArray(releaseIds) && releaseIds.length === 0) {
    return; // nothing to update, skip the query altogether
  }

  logger.info(`Updating search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);

  const documents = await knex.raw(`
    SELECT
      releases.id AS release_id,
      TO_TSVECTOR(
        'traxxx',
        COALESCE(releases.title, '') || ' ' ||
        COALESCE(networks.name, '') || ' ' ||
        COALESCE(networks.slug, '') || ' ' ||
        COALESCE(networks.url, '') || ' ' ||
        COALESCE(sites.name, '') || ' ' ||
        COALESCE(sites.slug, '') || ' ' ||
        COALESCE(sites.url, '') || ' ' ||
        COALESCE(sites.alias, '') || ' ' ||
        COALESCE(releases.shoot_id, '') || ' ' ||
        COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
        STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
        STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
        STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
      ) as document
    FROM releases
    LEFT JOIN sites ON releases.site_id = sites.id
    LEFT JOIN networks ON sites.network_id = networks.id
    LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
    LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
    LEFT JOIN actors ON local_actors.actor_id = actors.id
    LEFT JOIN tags ON local_tags.tag_id = tags.id
    LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for
    ${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
    GROUP BY releases.id, sites.name, sites.slug, sites.alias, sites.url, networks.name, networks.slug, networks.url;
  `, releaseIds && [releaseIds]);

  if (documents.rows?.length > 0) {
    // the insert is rendered to a string so the upsert clause can be appended
    const query = knex('releases_search').insert(documents.rows).toString();

    await knex.raw(`${query} ON CONFLICT (release_id) DO UPDATE SET document = EXCLUDED.document`);
  }
}
|
||||
|
||||
/**
 * Inserts a release, or updates the existing row when `argv.redownload` is set.
 *
 * A release is considered existing when a row with the same entry id and
 * site id is found. Without `argv.redownload` that row is returned untouched.
 *
 * @param {object} release - scraped release with `site` attached
 * @param {*} batchId - id of the current batch
 * @returns {Promise<object|null>} the stored, updated or existing release
 *   row, or null when the release has no entry id
 * @throws {Error} when the release has no site
 */
async function storeRelease(release, batchId) {
  if (!release.site) {
    throw new Error(`Missing site, unable to store "${release.title}" (${release.url})`);
  }

  if (!release.entryId) {
    logger.warn(`Missing entry ID, unable to store "${release.title}" (${release.url})`);
    return null;
  }

  const existingRelease = await knex('releases')
    .where({
      entry_id: release.entryId,
      site_id: release.site.id,
    })
    .first();

  if (existingRelease && !argv.redownload) {
    return existingRelease;
  }

  // curated only once it is known that the entry will be written
  const curatedRelease = await curateReleaseEntry(release, batchId, existingRelease);

  if (existingRelease) {
    const [updatedRelease] = await knex('releases')
      .where('id', existingRelease.id)
      .update({
        ...existingRelease,
        ...curatedRelease,
      })
      .returning('*');

    // tags are associated exactly once, whether or not the update returned a row
    await associateTags(release, existingRelease.id);

    if (updatedRelease) {
      logger.info(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);
    }

    // prefer the fresh row so callers see updated values such as the slug
    return updatedRelease || existingRelease;
  }

  const [releaseEntry] = await knex('releases')
    .insert(curatedRelease)
    .returning('*');

  await associateTags(release, releaseEntry.id);

  logger.info(`Stored release "${release.title}" (${releaseEntry.id}, ${release.site.name})`);

  return releaseEntry;
}
|
||||
|
||||
/**
 * Stores a list of scraped releases under a new batch, then associates their
 * actors, refreshes their search documents and stores their media.
 *
 * Releases that fail to store are logged and left out of the result.
 *
 * @param {object[]} releases - scraped releases
 * @returns {Promise<{releases: object[], actors: object}>} the stored releases
 *   (scraped data plus database id and slug) and the actors accumulated from them
 */
async function storeReleases(releases) {
  const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

  const storeSingleRelease = async (release) => {
    try {
      const releaseWithChannelSite = await attachChannelSite(release);
      const releaseWithStudio = await attachStudio(releaseWithChannelSite);
      const storedRelease = await storeRelease(releaseWithStudio, batchId);

      if (!storedRelease) {
        return storedRelease;
      }

      return {
        id: storedRelease.id,
        slug: storedRelease.slug,
        ...releaseWithChannelSite,
      };
    } catch (error) {
      logger.error(error);

      return null;
    }
  };

  const storeResults = await Promise.map(releases, storeSingleRelease, { concurrency: 10 });
  const storedReleases = storeResults.filter(Boolean);

  logger.info(`Stored ${storedReleases.length} new releases`);

  const actors = accumulateActors(storedReleases);

  await associateActors(actors, storedReleases);

  // actors need to be stored before generating search
  const storedReleaseIds = storedReleases.map(release => release.id);

  await Promise.all([
    updateReleasesSearch(storedReleaseIds),
    storeReleaseAssets(storedReleases),
  ]);

  const hasActors = Object.keys(actors).length > 0;

  if (argv.withProfiles && hasActors) {
    await scrapeBasicActors();
  }

  return {
    releases: storedReleases,
    actors,
  };
}
|
||||
|
||||
module.exports = {
|
||||
fetchReleases,
|
||||
fetchActorReleases,
|
||||
fetchSiteReleases,
|
||||
fetchNetworkReleases,
|
||||
fetchTagReleases,
|
||||
storeRelease,
|
||||
storeReleases,
|
||||
updateReleasesSearch,
|
||||
};
|
||||
459
src/releases.js
459
src/releases.js
@@ -1,361 +1,7 @@
|
||||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
const moment = require('moment');
|
||||
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const argv = require('./argv');
|
||||
const whereOr = require('./utils/where-or');
|
||||
const { associateTags } = require('./tags');
|
||||
const { associateActors, scrapeBasicActors } = require('./actors');
|
||||
const {
|
||||
pluckItems,
|
||||
storeMedia,
|
||||
associateMedia,
|
||||
} = require('./media');
|
||||
const { fetchSites } = require('./sites');
|
||||
const slugify = require('./utils/slugify');
|
||||
const capitalize = require('./utils/capitalize');
|
||||
|
||||
/**
 * Extends a query builder with the joins, column selection, tag exclusion,
 * date window, ordering and row limit shared by the release fetchers.
 *
 * The builder is extended in place and nothing is returned, which matches
 * how the fetchers pass this function to `.modify()`.
 *
 * @param {object} queryBuilder - query builder to extend
 * @param {object} [options]
 * @param {string|string[]} [options.filter=[]] - tag slugs; releases associated with any of them are excluded
 * @param {Date} [options.after] - exclusive lower bound for releases.date
 * @param {Date} [options.before] - inclusive upper bound for releases.date
 * @param {number} [options.limit=100] - maximum number of rows
 */
function commonQuery(queryBuilder, {
  filter = [],
  after = new Date(0), // January 1970
  before = new Date(2 ** 44), // 2 ** 44 ms after the epoch, i.e. year 2527
  limit = 100,
} = {}) {
  const finalFilter = [].concat(filter); // ensure filter is array

  queryBuilder
    .leftJoin('sites', 'releases.site_id', 'sites.id')
    .leftJoin('studios', 'releases.studio_id', 'studios.id')
    .leftJoin('networks', 'sites.network_id', 'networks.id')
    .select(
      'releases.*',
      'sites.name as site_name', 'sites.slug as site_slug', 'sites.url as site_url', 'sites.network_id', 'sites.parameters as site_parameters',
      // studio_slug is read when curating a release; it used to be a second copy of site_slug
      'studios.name as studio_name', 'studios.slug as studio_slug', 'studios.url as studio_url',
      'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description',
    )
    .whereNotExists((builder) => {
      // apply tag filters: drop releases that have any of the filtered tags
      builder
        .select('*')
        .from('tags_associated')
        .leftJoin('tags', 'tags_associated.tag_id', 'tags.id')
        .whereIn('tags.slug', finalFilter)
        .where('tags_associated.domain', 'releases')
        .whereRaw('tags_associated.target_id = releases.id');
    })
    .andWhere('releases.date', '>', after)
    .andWhere('releases.date', '<=', before)
    .orderBy([{ column: 'date', order: 'desc' }, { column: 'created_at', order: 'desc' }])
    .limit(limit);
}
|
||||
|
||||
/**
 * Turns a joined release row into the nested release object handed to callers.
 *
 * Loads the release's actors (with birth country and first avatar thumbnail),
 * tags and media, then maps the flat snake_case columns onto camelCase fields.
 *
 * @param {object} release - release row including the site_*, studio_* and network_* columns
 * @returns {Promise<object>} curated release
 */
async function curateRelease(release) {
  const actorsQuery = knex('actors_associated')
    .select(
      'actors.id', 'actors.name', 'actors.gender', 'actors.slug', 'actors.birthdate',
      'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name', 'birth_countries.alias as birth_country_alias',
      'media.thumbnail as avatar',
    )
    .where({ release_id: release.id })
    .leftJoin('actors', 'actors.id', 'actors_associated.actor_id')
    .leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2')
    .leftJoin('media', (builder) => {
      // only the actor media entry with index 0 is joined in as avatar
      builder
        .on('media.target_id', 'actors.id')
        .andOnVal('media.domain', 'actors')
        .andOnVal('media.index', '0');
    })
    .orderBy('actors.gender');

  const tagsQuery = knex('tags_associated')
    .select('tags.name', 'tags.slug')
    .where({
      domain: 'releases',
      target_id: release.id,
    })
    .leftJoin('tags', 'tags.id', 'tags_associated.tag_id')
    .orderBy('tags.priority', 'desc');

  const mediaQuery = knex('media')
    .where({
      target_id: release.id,
      domain: 'releases',
    })
    .orderBy(['role', 'index']);

  const [actors, tags, media] = await Promise.all([actorsQuery, tagsQuery, mediaQuery]);

  const mediaWithRole = role => media.filter(item => item.role === role);

  const curatedActors = actors.map((actor) => {
    // the country is exposed under its alias, not its full name
    const origin = actor.birth_country_alpha2
      ? {
        country: {
          name: actor.birth_country_alias,
          alpha2: actor.birth_country_alpha2,
        },
      }
      : null;

    return {
      id: actor.id,
      slug: actor.slug,
      name: actor.name,
      gender: actor.gender,
      birthdate: actor.birthdate,
      age: moment().diff(actor.birthdate, 'years'), // age today
      ageThen: moment(release.date).diff(actor.birthdate, 'years'), // age at the release date
      avatar: actor.avatar,
      origin,
    };
  });

  const studio = release.studio_id
    ? {
      id: release.studio_id,
      name: release.studio_name,
      slug: release.studio_slug,
      url: release.studio_url,
    }
    : null;

  return {
    id: release.id,
    type: release.type,
    title: release.title,
    date: release.date,
    dateAdded: release.created_at,
    description: release.description,
    url: release.url,
    shootId: release.shoot_id,
    entryId: release.entry_id,
    actors: curatedActors,
    director: release.director,
    tags,
    duration: release.duration,
    photos: mediaWithRole('photo'),
    poster: mediaWithRole('poster')[0],
    covers: mediaWithRole('cover'),
    trailer: mediaWithRole('trailer')[0],
    site: {
      id: release.site_id,
      name: release.site_name,
      independent: !!release.site_parameters?.independent,
      slug: release.site_slug,
      url: release.site_url,
    },
    studio,
    network: {
      id: release.network_id,
      name: release.network_name,
      description: release.network_description,
      slug: release.network_slug,
      url: release.network_url,
    },
  };
}
|
||||
|
||||
/**
 * Curates every release row in the list.
 *
 * @param {object[]} releases - release rows
 * @returns {Promise<object[]>} curated releases, in input order
 */
function curateReleases(releases) {
  const pendingReleases = releases.map(release => curateRelease(release));

  return Promise.all(pendingReleases);
}
|
||||
|
||||
/**
 * Replaces a release's site with the site matching its channel.
 *
 * Only applies when the current site is flagged as a fallback or the channel
 * is flagged `force`; any other release is returned untouched.
 *
 * @param {object} release - scraped release; `channel` may be a string or an object with name/slug
 * @returns {Promise<object>} the release itself, or a copy with `site` replaced
 * @throws {Error} when a channel is required but missing, or no site matches it
 */
async function attachChannelSite(release) {
  const needsChannelSite = release.site?.isFallback || release.channel?.force;

  if (!needsChannelSite) {
    return release;
  }

  const { channel } = release;

  if (!channel) {
    throw new Error(`Unable to derive channel site from generic URL: ${release.url}`);
  }

  // a channel given as a plain string serves as both name and slug
  const [site] = await fetchSites({
    name: channel.name || channel,
    slug: channel.slug || channel,
  });

  if (!site) {
    throw new Error(`Unable to match channel '${channel.slug || channel}' from generic URL: ${release.url}`);
  }

  return { ...release, site };
}
|
||||
|
||||
/**
 * Swaps a release's studio reference for the first studio row whose name,
 * slug or URL equals it.
 *
 * @param {object} release - release whose `studio` is a name, slug or URL
 * @returns {Promise<object>} the release itself when it has no studio,
 *   otherwise a copy whose `studio` is the result of the lookup
 */
async function attachStudio(release) {
  const { studio: studioReference } = release;

  if (!studioReference) {
    return release;
  }

  const studio = await knex('studios')
    .where('name', studioReference)
    .orWhere('slug', studioReference)
    .orWhere('url', studioReference)
    .first();

  return { ...release, studio };
}
|
||||
|
||||
/**
 * Builds the row written to the releases table for a scraped release.
 *
 * @param {object} release - scraped release with `site` (and optionally `studio`) attached
 * @param {*} batchId - id of the current batch, stored as updated_batch_id
 * @param {object} [existingRelease] - existing row, if any; created_batch_id is only set without one
 * @returns {Promise<object>} column/value map for the releases table
 */
async function curateReleaseEntry(release, batchId, existingRelease) {
  const titleSlug = slugify(release.title, {
    encode: true,
    limit: config.titleSlugLength,
  });

  const entry = {
    site_id: release.site.id,
    studio_id: release.studio ? release.studio.id : null,
    shoot_id: release.shootId || null,
    entry_id: release.entryId || null,
    type: release.type,
    url: release.url,
    title: release.title,
    slug: titleSlug,
    date: release.date,
    description: release.description,
    duration: release.duration,
    // anything but an explicit boolean counts as not deep
    deep: typeof release.deep === 'boolean' ? release.deep : false,
    deep_url: release.deepUrl,
    updated_batch_id: batchId,
  };

  if (!existingRelease) {
    entry.created_batch_id = batchId;
  }

  return entry;
}
|
||||
|
||||
/**
 * Fetches curated releases matching conditions on the releases table.
 *
 * @param {object} [queryObject={}] - conditions handed to whereOr for 'releases'
 * @param {object} [options={}] - filter/after/before/limit options for commonQuery
 * @returns {Promise<object[]>} curated releases
 */
async function fetchReleases(queryObject = {}, options = {}) {
  const matchReleases = builder => whereOr(queryObject, 'releases', builder);

  const rows = await knex('releases')
    .modify(commonQuery, options)
    .andWhere(matchReleases);

  return curateReleases(rows);
}
|
||||
|
||||
/**
 * Fetches curated releases matching conditions on the joined sites table.
 *
 * @param {object} queryObject - conditions handed to whereOr for 'sites'
 * @param {object} [options={}] - filter/after/before/limit options for commonQuery
 * @returns {Promise<object[]>} curated releases
 */
async function fetchSiteReleases(queryObject, options = {}) {
  const matchSites = builder => whereOr(queryObject, 'sites', builder);

  const rows = await knex('releases')
    .modify(commonQuery, options)
    .where(matchSites);

  return curateReleases(rows);
}
|
||||
|
||||
/**
 * Fetches curated releases matching conditions on the joined networks table.
 *
 * @param {object} queryObject - conditions handed to whereOr for 'networks'
 * @param {object} [options={}] - filter/after/before/limit options for commonQuery
 * @returns {Promise<object[]>} curated releases
 */
async function fetchNetworkReleases(queryObject, options = {}) {
  const matchNetworks = builder => whereOr(queryObject, 'networks', builder);

  const rows = await knex('releases')
    .modify(commonQuery, options)
    .where(matchNetworks);

  return curateReleases(rows);
}
|
||||
|
||||
/**
 * Fetches curated releases through the actor association table, matching
 * conditions on the joined actors table.
 *
 * @param {object} queryObject - conditions handed to whereOr for 'actors'
 * @param {object} [options={}] - filter/after/before/limit options for commonQuery
 * @returns {Promise<object[]>} curated releases
 */
async function fetchActorReleases(queryObject, options = {}) {
  const matchActors = builder => whereOr(queryObject, 'actors', builder);

  const rows = await knex('actors_associated')
    .leftJoin('releases', 'actors_associated.release_id', 'releases.id')
    .leftJoin('actors', 'actors_associated.actor_id', 'actors.id')
    .select('actors.name as actor_name')
    .modify(commonQuery, options)
    .where(matchActors);

  return curateReleases(rows);
}
|
||||
|
||||
/**
 * Fetch releases through the tag association table, matching any of the
 * conditions in `queryObject` evaluated against `tags`. Only associations
 * whose domain is `releases` are considered.
 *
 * @param {Object} queryObject - Conditions handed to `whereOr`.
 * @param {Object} [options={}] - Options handed to `commonQuery`.
 * @returns {Promise<*>} Result of `curateReleases` for the matching rows.
 */
async function fetchTagReleases(queryObject, options = {}) {
	const matchAny = builder => whereOr(queryObject, 'tags', builder);

	// start from the association table and pull in both sides of the relation
	const associations = knex('tags_associated')
		.leftJoin('releases', 'tags_associated.target_id', 'releases.id')
		.leftJoin('tags', 'tags_associated.tag_id', 'tags.id');

	const rows = await associations
		.select('tags.name as tag_name')
		.modify(commonQuery, options)
		.where('tags_associated.domain', 'releases')
		.where(matchAny);

	return curateReleases(rows);
}
|
||||
|
||||
/**
 * Collect the actors of a list of releases into a map keyed by actor slug.
 *
 * Each actor may be given as a plain name string or as an object with a
 * `name` and optional profile properties (including `avatar`). Entries that
 * are neither, or whose name yields an empty slug, are skipped instead of
 * aborting the whole accumulation with a TypeError.
 *
 * @param {Object[]} releases - Releases with an `id` and an optional `actors` array.
 * @returns {Object<string, Object>} Map of slug to
 *   `{ name, slug, releaseIds: Set, avatars: Array, ...profile }`.
 */
function accumulateActors(releases) {
	return releases.reduce((acc, release) => {
		if (!Array.isArray(release.actors)) return acc;

		// an avatar is either a source string or an object with src and optional copyright;
		// the copyright defaults to the capitalized network name of the release
		const curateAvatar = avatar => ({
			src: avatar.src || avatar,
			copyright: avatar.copyright === undefined
				? capitalize(release.site?.network?.name)
				: avatar.copyright,
		});

		release.actors.forEach((actor) => {
			const rawName = typeof actor === 'string' ? actor : actor?.name;

			// null, undefined and nameless entries used to throw on .trim()
			if (typeof rawName !== 'string') return;

			const actorName = rawName.trim();
			const actorSlug = slugify(actorName);

			if (!actorSlug) return;

			if (!acc[actorSlug]) {
				acc[actorSlug] = {
					name: actorName,
					slug: actorSlug,
					releaseIds: new Set(),
					avatars: [],
				};
			}

			acc[actorSlug].releaseIds.add(release.id);

			if (typeof actor === 'object') {
				// actor input contains profile info; merge it, but never let it replace
				// the trimmed name or the bookkeeping fields maintained here
				const { releaseIds, avatars } = acc[actorSlug];

				acc[actorSlug] = {
					...acc[actorSlug],
					...actor,
					name: actorName,
					slug: actorSlug,
					releaseIds,
					avatars,
				};
			}

			if (actor.avatar) {
				const avatar = Array.isArray(actor.avatar)
					? actor.avatar.map(avatarX => curateAvatar(avatarX))
					: curateAvatar(actor.avatar);

				acc[actorSlug].avatars = acc[actorSlug].avatars.concat([avatar]); // don't flatten fallbacks
			}
		});

		return acc;
	}, {});
}
|
||||
|
||||
/**
 * Store the media of the given releases and associate it with them.
 *
 * Nothing happens unless `argv.media` is set. Each role is additionally gated
 * by its own pair of flags (`images` + `posters`/`covers`/`photos`,
 * `videos` + `trailers`/`teasers`). Roles are processed one after another in
 * the order poster, cover, photo, trailer, teaser.
 *
 * @param {Object[]} releases - Stored releases carrying an `id` and their media sources.
 * @returns {Promise<void>}
 */
async function storeReleaseAssets(releases) {
	if (!argv.media) {
		return;
	}

	// build each id -> sources map in a single pass instead of re-spreading the
	// accumulator for every release
	const mapById = pickSources => Object.fromEntries(releases.map(release => [release.id, pickSources(release)]));

	const releasePostersById = mapById(release => [release.poster]);
	const releaseCoversById = mapById(release => release.covers);
	const releaseTrailersById = mapById(release => [release.trailer]);
	const releaseTeasersById = mapById(release => [release.teaser]);
	const releasePhotosById = mapById(release => pluckItems(release.photos));

	// store all sources of one role, then link whatever was stored back to the releases
	const storeRole = async (sourcesById, role) => {
		const media = await storeMedia(Object.values(sourcesById).flat(), 'release', role);

		if (media) await associateMedia(sourcesById, media, 'release', role);
	};

	if (argv.images && argv.posters) await storeRole(releasePostersById, 'poster');
	if (argv.images && argv.covers) await storeRole(releaseCoversById, 'cover');
	if (argv.images && argv.photos) await storeRole(releasePhotosById, 'photo');
	if (argv.videos && argv.trailers) await storeRole(releaseTrailersById, 'trailer');
	if (argv.videos && argv.teasers) await storeRole(releaseTeasersById, 'teaser');
}
|
||||
|
||||
async function updateReleasesSearch(releaseIds) {
|
||||
logger.info(`Updating search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);
|
||||
@@ -397,111 +43,6 @@ async function updateReleasesSearch(releaseIds) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Insert a release, or update it when it already exists and `argv.redownload` is set.
 *
 * A release is identified by the combination of its entry ID and site ID.
 *
 * @param {Object} release - Scraped release with `site` and `entryId`.
 * @param {*} batchId - Identifier of the batch that is currently being stored.
 * @returns {Promise<Object|null>} The inserted, updated or already existing row,
 *   or null when the release has no entry ID.
 * @throws {Error} When the release has no site attached.
 */
async function storeRelease(release, batchId) {
	if (!release.site) {
		throw new Error(`Missing site, unable to store "${release.title}" (${release.url})`);
	}

	if (!release.entryId) {
		logger.warn(`Missing entry ID, unable to store "${release.title}" (${release.url})`);
		return null;
	}

	const existingRelease = await knex('releases')
		.where({
			entry_id: release.entryId,
			site_id: release.site.id,
		})
		.first();

	const curatedRelease = await curateReleaseEntry(release, batchId, existingRelease);

	if (existingRelease && !argv.redownload) {
		return existingRelease;
	}

	if (existingRelease) {
		const [updatedRelease] = await knex('releases')
			.where('id', existingRelease.id)
			.update({
				...existingRelease,
				...curatedRelease,
			})
			.returning('*');

		// tags are associated exactly once; this used to run twice after a successful update
		await associateTags(release, existingRelease.id);

		if (updatedRelease) {
			logger.info(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);

			// hand back the fresh row so callers see the updated slug
			return updatedRelease;
		}

		return existingRelease;
	}

	const [releaseEntry] = await knex('releases')
		.insert(curatedRelease)
		.returning('*');

	await associateTags(release, releaseEntry.id);

	logger.info(`Stored release "${release.title}" (${releaseEntry.id}, ${release.site.name})`);

	return releaseEntry;
}
|
||||
|
||||
/**
 * Store a list of scraped releases under a new batch, then associate their
 * actors, update the search documents and store their media.
 *
 * Releases that fail to store are logged and left out of the result.
 *
 * @param {Object[]} releases - Scraped releases.
 * @returns {Promise<{ releases: Object[], actors: Object }>} The stored releases
 *   (scraped data plus database `id` and `slug`) and the accumulated actors.
 */
async function storeReleases(releases) {
	const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

	const storeOne = async (release) => {
		try {
			const releaseWithChannelSite = await attachChannelSite(release);
			const releaseWithStudio = await attachStudio(releaseWithChannelSite);
			const storedRelease = await storeRelease(releaseWithStudio, batchId);

			if (!storedRelease) return null;

			// database id and slug go last so scraped properties of the same name
			// cannot override them; later steps address the release by this id
			return {
				...releaseWithChannelSite,
				id: storedRelease.id,
				slug: storedRelease.slug,
			};
		} catch (error) {
			logger.error(error);

			return null;
		}
	};

	const storeResults = await Promise.map(releases, storeOne, {
		concurrency: 10,
	});

	const storedReleases = storeResults.filter(Boolean);

	logger.info(`Stored ${storedReleases.length} new releases`);

	const actors = accumulateActors(storedReleases);

	await associateActors(actors, storedReleases);

	await Promise.all([
		// actors need to be stored before generating search
		updateReleasesSearch(storedReleases.map(release => release.id)),
		storeReleaseAssets(storedReleases),
	]);

	if (argv.withProfiles && Object.keys(actors).length > 0) {
		await scrapeBasicActors();
	}

	return {
		releases: storedReleases,
		actors,
	};
}
|
||||
|
||||
module.exports = {
|
||||
fetchReleases,
|
||||
fetchActorReleases,
|
||||
fetchSiteReleases,
|
||||
fetchNetworkReleases,
|
||||
fetchTagReleases,
|
||||
storeRelease,
|
||||
storeReleases,
|
||||
updateReleasesSearch,
|
||||
};
|
||||
|
||||
@@ -282,7 +282,7 @@ function scrapeMovie({ el, qu }, url, site) {
|
||||
movie.entryId = qu.q('.dvd_details_overview .rating_box').dataset.id;
|
||||
movie.title = qu.q('.title_bar span', true);
|
||||
movie.covers = qu.urls('#dvd-cover-flip > a');
|
||||
movie.channel = qu.q('.update_date a', true);
|
||||
movie.channel = slugify(qu.q('.update_date a', true), '');
|
||||
|
||||
// movie.releases = Array.from(document.querySelectorAll('.cell.dvd_info > a'), el => el.href);
|
||||
const sceneQus = ctxa(el, '.dvd_details');
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
'use strict';
|
||||
|
||||
/* eslint-disable newline-per-chained-call */
|
||||
const bhttp = require('bhttp');
|
||||
const { JSDOM } = require('jsdom');
|
||||
const cheerio = require('cheerio');
|
||||
const moment = require('moment');
|
||||
|
||||
const { get } = require('../utils/http');
|
||||
|
||||
const descriptionTags = {
|
||||
'anal cream pie': 'anal creampie',
|
||||
'ass to mouth': 'ass to mouth',
|
||||
@@ -85,26 +86,29 @@ async function scrapeLatestA(html, site) {
|
||||
}));
|
||||
}
|
||||
|
||||
async function scrapeLatestB(html, site) {
|
||||
async function scrapeLatestB(html) {
|
||||
const { document } = new JSDOM(html).window;
|
||||
const sceneElements = document.querySelectorAll('.content-border');
|
||||
|
||||
return Promise.all(Array.from(sceneElements, async (element) => {
|
||||
const $ = cheerio.load(element.innerHTML, { normalizeWhitespace: true });
|
||||
const release = {
|
||||
director: 'Mike Adriano',
|
||||
};
|
||||
|
||||
const titleElement = element.querySelector('.content-title-wrap a');
|
||||
const title = titleElement.title || titleElement.textContent.trim();
|
||||
const url = titleElement.href;
|
||||
const entryId = url.split('/').slice(-2)[0];
|
||||
release.title = titleElement.title || titleElement.textContent.trim();
|
||||
release.url = titleElement.href;
|
||||
release.entryId = release.url.split('/').slice(-2)[0];
|
||||
|
||||
const description = element.querySelector('.content-description').textContent.trim();
|
||||
const date = (moment(element.querySelector('.mobile-date').textContent, 'MM/DD/YYYY')
|
||||
release.description = element.querySelector('.content-description').textContent.trim();
|
||||
release.date = (moment(element.querySelector('.mobile-date').textContent, 'MM/DD/YYYY')
|
||||
|| moment(element.querySelector('.date').textContent, 'Do MMM YYYY')).toDate();
|
||||
const actors = Array.from(element.querySelectorAll('.content-models a'), actorElement => actorElement.textContent);
|
||||
release.actors = Array.from(element.querySelectorAll('.content-models a'), actorElement => actorElement.textContent);
|
||||
|
||||
const durationString = element.querySelector('.total-time').textContent.trim();
|
||||
// timestamp is somethines 00:00, sometimes 0:00:00
|
||||
const duration = durationString.split(':').length === 3
|
||||
release.duration = durationString.split(':').length === 3
|
||||
? moment.duration(durationString).asSeconds()
|
||||
: moment.duration(`00:${durationString}`).asSeconds();
|
||||
|
||||
@@ -114,65 +118,44 @@ async function scrapeLatestB(html, site) {
|
||||
.toArray()
|
||||
.map(photoUrl => photoUrl.slice(photoUrl.indexOf('http'), photoUrl.indexOf('.jpg') + 4));
|
||||
|
||||
const photos = [...primaryPhotos, ...secondaryPhotos];
|
||||
const tags = deriveTagsFromDescription(description);
|
||||
release.poster = poster;
|
||||
release.photos = [...primaryPhotos, ...secondaryPhotos];
|
||||
|
||||
return {
|
||||
url,
|
||||
entryId,
|
||||
title,
|
||||
description,
|
||||
actors,
|
||||
director: 'Mike Adriano',
|
||||
date,
|
||||
duration,
|
||||
tags,
|
||||
poster,
|
||||
photos,
|
||||
site,
|
||||
};
|
||||
release.tags = deriveTagsFromDescription(release.description);
|
||||
return release;
|
||||
}));
|
||||
}
|
||||
|
||||
/**
 * Scrape a scene page of the first site layout into a release object.
 *
 * The entry ID is the second-to-last path segment of the scene URL. Poster and
 * trailer are only set when the page contains the corresponding video and
 * source elements, so a scene without a trailer no longer fails the scrape.
 *
 * @param {string} html - Markup of the scene page.
 * @param {string} url - URL the markup was fetched from.
 * @returns {Promise<Object>} The scraped release.
 */
async function scrapeSceneA(html, url) {
	const { document } = new JSDOM(html).window;
	const element = document.querySelector('.content-page-info');

	const release = {
		url,
		director: 'Mike Adriano',
	};

	release.entryId = url.split('/').slice(-2)[0];
	release.title = element.querySelector('.title').textContent.trim();
	release.description = element.querySelector('.desc').textContent.trim();
	release.date = moment(element.querySelector('.post-date').textContent.trim(), 'Do MMM YYYY').toDate();

	release.actors = Array.from(element.querySelectorAll('.models a'), actorElement => actorElement.textContent);

	const durationString = element.querySelector('.total-time').textContent.trim();

	// timestamp is sometimes 00:00, sometimes 0:00:00
	release.duration = durationString.split(':').length === 3
		? moment.duration(durationString).asSeconds()
		: moment.duration(`00:${durationString}`).asSeconds();

	const videoElement = document.querySelector('.content-page-header video');
	const sourceElement = document.querySelector('.content-page-header source');

	if (videoElement) {
		release.poster = videoElement.poster;
	}

	if (sourceElement) {
		release.trailer = { src: sourceElement.src, type: sourceElement.type };
	}

	release.tags = deriveTagsFromDescription(release.description);

	return release;
}
|
||||
|
||||
async function scrapeSceneB(html, url, site) {
|
||||
@@ -220,25 +203,34 @@ async function scrapeSceneB(html, url, site) {
|
||||
|
||||
/**
 * Fetch and scrape one page of the latest videos of a site.
 *
 * The page is requested from the `tour.` subdomain of the site's host. Two
 * hosts use the first page layout; every other host uses the second one.
 *
 * @param {Object} site - Site with a `url`.
 * @param {number} [page=1] - Page number to request.
 * @returns {Promise<Object[]|number>} The scraped releases, or the HTTP status
 *   code when the response was not a 200.
 */
async function fetchLatest(site, page = 1) {
	const { host } = new URL(site.url);
	const url = `https://tour.${host}/videos?page=${page}`;

	const res = await get(url);

	if (res.code === 200) {
		if (host === 'trueanal.com' || host === 'swallowed.com') {
			return scrapeLatestA(res.html, site);
		}

		return scrapeLatestB(res.html, site);
	}

	return res.code;
}
|
||||
|
||||
/**
 * Fetch and scrape a single scene page.
 *
 * Two hosts use the first page layout; every other host uses the second one.
 *
 * @param {string} url - URL of the scene page.
 * @param {Object} site - Site with a `url`; its host selects the scraper.
 * @returns {Promise<Object|number>} The scraped release, or the HTTP status
 *   code when the response was not a 200.
 */
async function fetchScene(url, site) {
	const { host } = new URL(site.url);
	const res = await get(url);

	if (res.code === 200) {
		// use the decoded markup provided on the response, like fetchLatest does
		if (host === 'trueanal.com' || host === 'swallowed.com') {
			return scrapeSceneA(res.html, url, site);
		}

		return scrapeSceneB(res.html, url, site);
	}

	return res.code;
}
|
||||
|
||||
module.exports = {
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
const config = require('config');
|
||||
|
||||
const argv = require('./argv');
|
||||
const logger = require('./logger');
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const slugify = require('./utils/slugify');
|
||||
|
||||
@@ -98,9 +98,10 @@ async function extractUniqueReleases(releases) {
|
||||
.whereIn(['entry_id', 'site_id'], releases.map(release => [release.entryId, release.site.id]));
|
||||
|
||||
const duplicateReleaseEntryKeys = new Set(duplicateReleaseEntries.map(releaseEntry => `${releaseEntry.site_id}_${releaseEntry.entry_id}`));
|
||||
const duplicateReleases = releases.filter(release => duplicateReleaseEntryKeys.has(`${release.site.id}_${release.entryId}`));
|
||||
const uniqueReleases = releases.filter(release => !duplicateReleaseEntryKeys.has(`${release.site.id}_${release.entryId}`));
|
||||
|
||||
return uniqueReleases;
|
||||
return { duplicateReleases, uniqueReleases };
|
||||
}
|
||||
|
||||
async function storeReleases(releases) {
|
||||
@@ -110,14 +111,19 @@ async function storeReleases(releases) {
|
||||
const releasesWithStudios = await attachStudios(releasesWithSites);
|
||||
|
||||
// uniqueness is site ID + entry ID, filter uniques after adding sites
|
||||
const uniqueReleases = argv.redownload
|
||||
? releasesWithStudios
|
||||
: await extractUniqueReleases(releasesWithStudios);
|
||||
const { uniqueReleases, duplicateReleases } = await extractUniqueReleases(releasesWithStudios);
|
||||
|
||||
const curatedReleaseEntries = uniqueReleases.slice(0, 2).map(release => curateReleaseEntry(release, batchId));
|
||||
console.log(argv.redownload, duplicateReleases);
|
||||
|
||||
const curatedReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
|
||||
const storedReleases = await knex('releases').insert(curatedReleaseEntries).returning('*');
|
||||
|
||||
console.log(storedReleases);
|
||||
if (Array.isArray(storedReleases)) {
|
||||
return storedReleases;
|
||||
}
|
||||
|
||||
// nothing inserted
|
||||
return [];
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
|
||||
@@ -26,23 +26,33 @@ const afterDate = (() => {
|
||||
})();
|
||||
|
||||
/**
 * Filter out the releases that are already stored or already accumulated.
 *
 * A release is identified by the combination of its site ID and entry ID.
 *
 * @param {Object[]} latestReleases - Freshly scraped releases with `site.id` and `entryId`.
 * @param {Object[]} accReleases - Releases accumulated from previous pages.
 * @returns {Promise<Object[]>} The releases from `latestReleases` that are neither
 *   stored nor accumulated yet.
 */
async function extractUniqueReleases(latestReleases, accReleases) {
	const latestReleaseIdentifiers = latestReleases
		.map(release => [release.site.id, release.entryId]);

	const duplicateReleases = await knex('releases')
		.whereIn(['site_id', 'entry_id'], latestReleaseIdentifiers);

	// add identifiers of accumulated releases to prevent an infinite scrape loop
	// when one page contains the same release as the previous
	const duplicateReleaseKeys = new Set(duplicateReleases
		.concat(accReleases)
		.map((release) => {
			// database rows use snake case, scraped releases carry a site object
			const siteId = release.site_id ?? release.site.id;
			const entryId = release.entry_id ?? release.entryId;

			return `${siteId}_${entryId}`;
		}));

	return latestReleases
		.filter(release => !duplicateReleaseKeys.has(`${release.site.id}_${release.entryId}`));
}
|
||||
|
||||
function getNextPage(uniqueReleases, pageAccReleases, oldestReleaseOnPage) {
|
||||
function needNextPage(uniqueReleases, pageAccReleases) {
|
||||
if (uniqueReleases === 0) {
|
||||
return false;
|
||||
}
|
||||
@@ -52,9 +62,13 @@ function getNextPage(uniqueReleases, pageAccReleases, oldestReleaseOnPage) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (oldestReleaseOnPage && moment(oldestReleaseOnPage.date).isAfter(afterDate)) {
|
||||
// oldest release on page is newer than the specified date cut-off
|
||||
return true;
|
||||
const oldestReleaseOnPage = uniqueReleases
|
||||
.sort((releaseA, releaseB) => releaseB.date - releaseA.date)
|
||||
.slice(-1)[0];
|
||||
|
||||
if (oldestReleaseOnPage && moment(oldestReleaseOnPage.date).isAfter(afterDate)) {
|
||||
// oldest release on page is newer than the specified date cut-off
|
||||
return true;
|
||||
}
|
||||
|
||||
// dates missing, and limit for scenes without dates not yet reached
|
||||
@@ -81,7 +95,6 @@ async function scrapeLatestReleases(scraper, site, preData) {
|
||||
}
|
||||
|
||||
const latestReleasesWithSite = latestReleases.map(release => ({ ...release, site: release.site || site })); // attach site release is assigned to when stored
|
||||
const oldestReleaseOnPage = latestReleases.sort((releaseA, releaseB) => releaseB.date - releaseA.date).slice(-1)[0];
|
||||
|
||||
const uniqueReleases = argv.redownload
|
||||
? latestReleasesWithSite
|
||||
@@ -91,25 +104,25 @@ async function scrapeLatestReleases(scraper, site, preData) {
|
||||
|
||||
logger.verbose(`Scraped '${site.name}' (${site.network.name}) page ${page}, found ${uniqueReleases.length} unique releases`);
|
||||
|
||||
if (getNextPage(uniqueReleases, pageAccReleases, oldestReleaseOnPage)) {
|
||||
return scrapePage(page + 1, accReleases.concat(uniqueReleases));
|
||||
if (needNextPage(uniqueReleases, pageAccReleases)) {
|
||||
return scrapePage(page + 1, pageAccReleases);
|
||||
}
|
||||
|
||||
if (argv.last) {
|
||||
return pageAccReleases.slice(0, argv.last);
|
||||
}
|
||||
|
||||
if (oldestReleaseOnPage) {
|
||||
const recentReleases = uniqueReleases
|
||||
.filter(release => moment(release.date).isAfter(afterDate));
|
||||
|
||||
return accReleases.concat(recentReleases);
|
||||
}
|
||||
|
||||
return pageAccReleases.slice(0, argv.nullDateLimit);
|
||||
return pageAccReleases;
|
||||
};
|
||||
|
||||
return scrapePage(1, []);
|
||||
const releases = await scrapePage(1, []);
|
||||
|
||||
if (argv.last) {
|
||||
return releases.slice(0, argv.last);
|
||||
}
|
||||
|
||||
if (releases.every(release => release.date)) {
|
||||
return releases
|
||||
.filter(release => moment(release.date).isAfter(afterDate));
|
||||
}
|
||||
|
||||
return releases.slice(0, argv.nullDateLimit);
|
||||
}
|
||||
|
||||
async function scrapeUpcomingReleases(scraper, site, preData) {
|
||||
|
||||
@@ -7,6 +7,14 @@ const taskQueue = require('promise-task-queue');
|
||||
|
||||
const logger = require('../logger')(__filename);
|
||||
|
||||
// Headers sent with every request. Frozen because the object is shared by all requests.
// NOTE(review): the request options spread these after the caller's headers, so a
// caller cannot currently override the user agent — confirm that is intended.
const defaultHeaders = Object.freeze({
	'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
});

// Request options applied to every request; the response timeout is in milliseconds.
// NOTE(review): these are likewise spread after the caller's options; only
// `options.timeout` is applied on top of them — confirm that is intended.
const defaultOptions = Object.freeze({
	responseTimeout: 30000,
});
|
||||
|
||||
const proxyAgent = tunnel.httpsOverHttp({
|
||||
proxy: {
|
||||
host: config.proxy.host,
|
||||
@@ -25,19 +33,15 @@ function useProxy(url) {
|
||||
|
||||
const queue = taskQueue();
|
||||
|
||||
queue.on('concurrencyReached:httpGet', () => {
|
||||
logger.silly('Queueing GET requests');
|
||||
});
|
||||
|
||||
queue.on('concurrencyReached:httpPost', () => {
|
||||
logger.silly('Queueing POST requests');
|
||||
queue.on('concurrencyReached:http', () => {
|
||||
logger.silly('Queueing requests');
|
||||
});
|
||||
|
||||
queue.define('http', async ({
|
||||
url,
|
||||
method = 'GET',
|
||||
body,
|
||||
timeout = 30000,
|
||||
headers = {},
|
||||
options = {},
|
||||
}) => {
|
||||
if (body) {
|
||||
@@ -47,8 +51,13 @@ queue.define('http', async ({
|
||||
}
|
||||
|
||||
const reqOptions = {
|
||||
responseTimeout: timeout,
|
||||
headers: {
|
||||
...headers,
|
||||
...defaultHeaders,
|
||||
},
|
||||
...options,
|
||||
...defaultOptions,
|
||||
...(options.timeout && { responseTimeout: options.timeout }),
|
||||
};
|
||||
|
||||
if (useProxy(url)) {
|
||||
@@ -59,26 +68,33 @@ queue.define('http', async ({
|
||||
? await bhttp[method.toLowerCase()](url, body, reqOptions)
|
||||
: await bhttp[method.toLowerCase()](url, reqOptions);
|
||||
|
||||
const html = Buffer.isBuffer(res.body) ? res.body.toString() : null;
|
||||
const json = Buffer.isBuffer(res.body) ? null : res.body;
|
||||
|
||||
return {
|
||||
...res,
|
||||
html,
|
||||
json,
|
||||
code: res.statusCode,
|
||||
};
|
||||
}, {
|
||||
concurrency: 20,
|
||||
});
|
||||
|
||||
/**
 * Queue a GET request on the shared HTTP task queue.
 *
 * @param {string} url - URL to request.
 * @param {Object} [headers] - Request headers.
 * @param {Object} [options] - Request options.
 * @returns {Promise<Object>} Whatever the queued `http` task resolves to.
 */
async function get(url, headers, options) {
	return queue.push('http', {
		method: 'get',
		url,
		headers,
		options,
	});
}
|
||||
|
||||
/**
 * Queue a POST request on the shared HTTP task queue.
 *
 * @param {string} url - URL to request.
 * @param {*} body - Request body.
 * @param {Object} [headers] - Request headers.
 * @param {Object} [options] - Request options.
 * @returns {Promise<Object>} Whatever the queued `http` task resolves to.
 */
async function post(url, body, headers, options) {
	return queue.push('http', {
		// the queued task defaults to GET when no method is given, so name it explicitly
		method: 'post',
		url,
		body,
		headers,
		options,
	});
}
|
||||
|
||||
Reference in New Issue
Block a user