// traxxx/src/fetch-releases.js
// (repository web-view header: 361 lines, 13 KiB, JavaScript)

'use strict';
const config = require('config');
// (view-history timestamp: 2019-05-08 03:50:13 +00:00)
const fs = require('fs-extra');
const path = require('path');
const Promise = require('bluebird');
const moment = require('moment');
const mime = require('mime');
// (view-history timestamp: 2019-05-08 03:50:13 +00:00)
const bhttp = require('bhttp');
const argv = require('./argv');
const knex = require('./knex');
const scrapers = require('./scrapers');
const fetchScene = require('./fetch-scene');
/**
 * Split the configured include list into network entries and site entries.
 *
 * Plain entries are collected as networks. Array entries contribute the
 * items of their second element to the site list; their first element is
 * not added to the network list.
 *
 * @param {Array} networks - include list, mixing plain entries and arrays
 * @returns {{ networks: Array, sites: Array }} the separated entries
 */
function destructConfigNetworks(networks) {
  const included = {
    networks: [],
    sites: [],
  };

  for (const entry of networks) {
    if (Array.isArray(entry)) {
      // network specifies sites
      const [, entrySites] = entry;

      included.sites.push(...entrySites);
    } else {
      included.networks.push(entry);
    }
  }

  return included;
}
/**
 * Decode the `parameters` value of a site row.
 *
 * Strings are JSON-decoded (invalid JSON still throws). Any other value is
 * passed through unchanged, so a value that is already an object is not fed
 * to JSON.parse. A missing value becomes null.
 *
 * @param {*} parameters - raw `parameters` value of a site row
 * @returns {*} the decoded parameters, or null when absent
 */
function parseSiteParameters(parameters) {
  if (typeof parameters === 'string') {
    return JSON.parse(parameters);
  }

  return parameters === undefined ? null : parameters;
}

/**
 * Convert raw site rows (with joined network columns) into nested site objects.
 *
 * @param {Array<Object>} sites - rows carrying id, name, slug, description,
 *   url, network_id, network_name, network_slug and parameters
 * @returns {Array<Object>} sites with the network columns grouped under
 *   `network` and `parameters` decoded
 */
function curateSites(sites) {
  return sites.map(site => ({
    id: site.id,
    name: site.name,
    slug: site.slug,
    description: site.description,
    url: site.url,
    network: {
      id: site.network_id,
      name: site.network_name,
      slug: site.network_slug,
    },
    parameters: parseSiteParameters(site.parameters),
  }));
}
/**
 * Load the sites to scrape from the database.
 *
 * When `argv.networks` or `argv.sites` is set, those slugs decide the
 * selection; otherwise the selection comes from `config.include`. A site is
 * included when its slug is listed or when it belongs to a listed network.
 *
 * Both paths run the same query and select `networks.slug as network_slug`,
 * which curateSites() maps to `site.network.slug`.
 *
 * @returns {Promise<Array<Object>>} curated site objects
 */
async function accumulateIncludedSites() {
  const included = argv.networks || argv.sites
    ? { networks: argv.networks || [], sites: argv.sites || [] }
    : destructConfigNetworks(config.include);

  const networks = await knex('networks')
    .select('id')
    .whereIn('slug', included.networks || []);

  const networkIds = networks.map(network => network.id);

  const rawSites = await knex('sites')
    .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug')
    .whereIn('sites.slug', included.sites || [])
    .orWhereIn('network_id', networkIds)
    .leftJoin('networks', 'sites.network_id', 'networks.id');

  return curateSites(rawSites);
}
/**
 * Look up stored releases that share a shoot ID or entry ID with any of the
 * given scraped releases.
 *
 * @param {Array<Object>} latestReleases - scraped releases with optional
 *   `shootId` and `entryId`
 * @param {*} _siteId - accepted but not used by the query
 * @returns {Promise<Array<Object>>} matching rows from the `releases` table
 */
async function findDuplicateReleases(latestReleases, _siteId) {
  const shootIds = [];
  const entryIds = [];

  for (const { shootId, entryId } of latestReleases) {
    // only undefined is skipped; every other value is passed to the query
    if (shootId !== undefined) {
      shootIds.push(shootId);
    }

    if (entryId !== undefined) {
      entryIds.push(entryId);
    }
  }

  const query = knex('releases').whereIn('shoot_id', shootIds);

  return query.orWhereIn('entry_id', entryIds);
}
/**
 * Make sure every actor named on the release exists in `actors`, then link
 * them to the stored release through `actors_associated`.
 *
 * Missing actors are inserted with `ON CONFLICT DO NOTHING`. A conflicting
 * row is not reported back by that statement, so the actors are read again
 * by name after the insert; this also picks up actors created in the
 * meantime by another release being stored concurrently.
 *
 * @param {Object} release - scraped release; `release.actors` is a list of names
 * @param {Object} releaseEntry - stored release row; `id` is used for the link
 * @returns {Promise<Array<Object>>} the inserted association rows, or an
 *   empty array when there is nothing to associate
 */
async function storeActors(release, releaseEntry) {
  // a name listed twice must not produce two inserts or two associations
  const actorNames = [...new Set(release.actors)];

  const knownActors = await knex('actors').whereIn('name', actorNames);
  const knownNames = new Set(knownActors.map(actor => actor.name));
  const newActorNames = actorNames.filter(actorName => !knownNames.has(actorName));

  let actors = knownActors;

  if (newActorNames.length > 0) {
    const insertQuery = knex('actors').insert(newActorNames.map(actorName => ({
      name: actorName,
      slug: actorName.toLowerCase().replace(/\s+/g, '-'),
    }))).toString();

    await knex.raw(`${insertQuery} ON CONFLICT DO NOTHING`);

    // re-read instead of relying on RETURNING, which omits conflicting rows
    actors = await knex('actors').whereIn('name', actorNames);
  }

  if (actors.length === 0) {
    return [];
  }

  return knex('actors_associated').insert(actors.map(actor => ({
    release_id: releaseEntry.id,
    actor_id: actor.id,
  })), '*');
}
/**
 * Link each tag ID on the release to the stored release via `tags_associated`.
 *
 * @param {Object} release - scraped release; `release.tags` holds tag IDs
 * @param {Object} releaseEntry - stored release row; `id` is used for the link
 * @returns {Promise<*>} result of the insert query
 */
async function storeTags(release, releaseEntry) {
  const associations = [];

  for (const tagId of release.tags) {
    associations.push({
      tag_id: tagId,
      release_id: releaseEntry.id,
    });
  }

  return knex('tags_associated').insert(associations);
}
/**
 * Download the release's photos, write them below `config.photoPath` and
 * register the stored files in the `media` table.
 *
 * Photos are fetched two at a time. A photo answered with a status other
 * than 200 is logged and skipped. The target directory is expected to exist
 * already.
 *
 * @param {Object} release - scraped release; uses `photos` (URLs), `site` and `title`
 * @param {Object} releaseEntry - stored release row; `id` names the directory
 *   and is stored as `target_id`
 * @returns {Promise<void>}
 */
async function storePhotos(release, releaseEntry) {
  console.log(`Storing ${release.photos.length} photos for (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);

  const files = await Promise.map(release.photos, async (photoUrl, index) => {
    const { pathname } = new URL(photoUrl);
    const res = await bhttp.get(photoUrl);

    if (res.statusCode !== 200) {
      console.warn(`Failed to store photo ${index + 1} for (${release.site.name}, ${releaseEntry.id}) "${release.title}": ${res.statusCode}`);

      return null;
    }

    // Prefer the type implied by the URL path. When the path has no usable
    // extension, fall back to the response header and finally to the same
    // default storePoster uses, so the file never ends up named `N.null`.
    const mimetype = [mime.getType(pathname), res.headers['content-type'], 'image/jpeg']
      .find(type => type && mime.getExtension(type));

    const filepath = path.join(release.site.slug, releaseEntry.id.toString(), `${index + 1}.${mime.getExtension(mimetype)}`);

    await fs.writeFile(path.join(config.photoPath, filepath), res.body);

    return {
      filepath,
      mimetype,
    };
  }, {
    concurrency: 2,
  });

  const storedFiles = files.filter(file => file);

  if (storedFiles.length === 0) {
    // every download failed; do not issue an empty insert
    return;
  }

  // `index` is the position among the photos that were actually stored
  await knex('media').insert(storedFiles.map(({ filepath, mimetype }, index) => ({
    path: filepath,
    mime: mimetype,
    index,
    domain: 'releases',
    target_id: releaseEntry.id,
    role: 'photo',
  })));
}
// (view-history timestamp: 2019-09-10 14:48:04 +00:00)
/**
 * Download the release's poster, write it below `config.photoPath` and
 * register it in the `media` table with role `poster`.
 *
 * A response with a status other than 200 is logged and nothing is stored.
 * The target directory is expected to exist already.
 *
 * @param {Object} release - scraped release; uses `poster` (URL), `site` and `title`
 * @param {Object} releaseEntry - stored release row; `id` names the directory
 *   and is stored as `target_id`
 * @returns {Promise<void>}
 */
async function storePoster(release, releaseEntry) {
  console.log(`Storing poster for (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);

  const res = await bhttp.get(release.poster);

  if (res.statusCode !== 200) {
    console.warn(`Failed to store poster for (${release.site.name}, ${releaseEntry.id}) "${release.title}": ${res.statusCode}`);

    return;
  }

  const { pathname } = new URL(release.poster);

  // Same precedence as before (header, URL path, JPEG default), but a type
  // without a known file extension is skipped so the file is never named
  // `poster.null`.
  const mimetype = [res.headers['content-type'], mime.getType(pathname), 'image/jpeg']
    .find(type => type && mime.getExtension(type));

  const filepath = path.join(release.site.slug, releaseEntry.id.toString(), `poster.${mime.getExtension(mimetype)}`);

  await fs.writeFile(path.join(config.photoPath, filepath), res.body);

  await knex('media').insert({
    path: filepath,
    mime: mimetype,
    domain: 'releases',
    target_id: releaseEntry.id,
    role: 'poster',
  });
}
/**
 * Download the release's trailer, write it below `config.photoPath` and
 * register it in the `media` table with role `trailer`.
 *
 * Like storePhotos and storePoster, a response with a status other than 200
 * is logged and nothing is stored. The target directory is expected to exist
 * already.
 *
 * @param {Object} release - scraped release; uses `trailer` (`src`, optional
 *   `type` and `quality`), `site` and `title`
 * @param {Object} releaseEntry - stored release row; `id` names the directory
 *   and is stored as `target_id`
 * @returns {Promise<void>}
 */
async function storeTrailer(release, releaseEntry) {
  console.log(`Storing trailer for (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);

  const { pathname } = new URL(release.trailer.src);
  const res = await bhttp.get(release.trailer.src);

  if (res.statusCode !== 200) {
    console.warn(`Failed to store trailer for (${release.site.name}, ${releaseEntry.id}) "${release.title}": ${res.statusCode}`);

    return;
  }

  // Scraper-provided type first, then the URL path, then the response
  // header. Types without a known extension are skipped to avoid a
  // `trailer.null` file.
  // NOTE(review): the final `video/mp4` default is an assumption - confirm.
  const mimetype = [release.trailer.type, mime.getType(pathname), res.headers['content-type'], 'video/mp4']
    .find(type => type && mime.getExtension(type));

  const qualitySuffix = release.trailer.quality ? `_${release.trailer.quality}` : '';
  const filepath = path.join(release.site.slug, releaseEntry.id.toString(), `trailer${qualitySuffix}.${mime.getExtension(mimetype)}`);

  await fs.writeFile(path.join(config.photoPath, filepath), res.body);

  await knex('media').insert({
    path: filepath,
    mime: mimetype,
    domain: 'releases',
    target_id: releaseEntry.id,
    role: 'trailer',
    quality: release.trailer.quality || null,
  });
}
/**
 * Insert scraped releases into the `releases` table and store their actors,
 * tags, photos, poster and trailer.
 *
 * Releases are processed two at a time. The insert uses
 * `ON CONFLICT DO NOTHING`; when no row comes back the release is reported
 * as a possible collision and its associated data is not stored.
 *
 * @param {Array<Object>} [releases=[]] - scraped releases
 * @returns {Promise<Array<undefined>>} resolves once every release is handled
 */
async function storeReleases(releases = []) {
  return Promise.map(releases, async (release) => {
    const curatedRelease = {
      site_id: release.site.id,
      studio_id: release.studio ? release.studio.id : null,
      shoot_id: release.shootId || null,
      entry_id: release.entryId || null,
      url: release.url,
      title: release.title,
      date: release.date,
      description: release.description,
      duration: release.duration,
      likes: release.rating && release.rating.likes,
      dislikes: release.rating && release.rating.dislikes,
      rating: release.rating && release.rating.stars,
      deep: argv.deep,
    };

    const releaseQuery = `${knex('releases').insert(curatedRelease).toString()} ON CONFLICT DO NOTHING RETURNING *`;
    const { rows: [releaseEntry] } = await knex.raw(releaseQuery);

    if (!releaseEntry) {
      console.error(`Unable to save scene to database, possible collision: "${release.title}" (${release.site.name})`);

      return;
    }

    console.log(`Stored (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);

    const hasPhotos = Boolean(release.photos && release.photos.length > 0);

    // The trailer is written into the same directory as the poster and
    // photos, so the directory must also exist for a trailer-only release.
    if (release.poster || release.trailer || hasPhotos) {
      await fs.mkdir(path.join(config.photoPath, release.site.slug, releaseEntry.id.toString()), { recursive: true });
    }

    await Promise.all([
      release.actors && release.actors.length > 0
        ? storeActors(release, releaseEntry) : Promise.resolve(),
      release.tags && release.tags.length > 0
        ? storeTags(release, releaseEntry) : Promise.resolve(),
      hasPhotos
        ? storePhotos(release, releaseEntry) : Promise.resolve(),
      release.poster
        ? storePoster(release, releaseEntry) : Promise.resolve(),
      release.trailer
        ? storeTrailer(release, releaseEntry) : Promise.resolve(),
    ]);
  }, {
    concurrency: 2,
  });
}
/**
 * Scrape a site's latest releases page by page and collect those that are
 * not stored yet and are dated after `afterDate`.
 *
 * Paging continues while a page yields at least one new release and the last
 * release on that page is still after `afterDate`. An empty page ends the
 * paging and returns what was collected so far.
 *
 * @param {Object} scraper - scraper exposing `fetchLatest(site, page)`
 * @param {Object} site - site to scrape; `id` and `name` are used here
 * @param {Date} afterDate - only releases after this date are kept
 * @param {Array<Object>} [accReleases=[]] - releases collected from earlier pages
 * @param {number} [page=1] - page to scrape
 * @returns {Promise<Array<Object>>} the new releases found
 */
async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page = 1) {
  const latestReleases = await scraper.fetchLatest(site, page);

  if (latestReleases.length === 0) {
    // ran past the last page; keep the releases gathered from earlier pages
    return accReleases;
  }

  const duplicateReleases = await findDuplicateReleases(latestReleases, site.id);

  // Lookups below use String(id), so every known ID is stored as a string
  // too. Releases accumulated from earlier pages are included to prevent an
  // infinite loop if the next page contains the same releases as the previous.
  const knownIds = new Set(
    duplicateReleases.map(release => release.shoot_id)
      .concat(duplicateReleases.map(release => release.entry_id))
      .concat(accReleases.map(release => release.shootId))
      .concat(accReleases.map(release => release.entryId))
      .filter(id => id !== undefined && id !== null)
      .map(String),
  );

  const uniqueReleases = latestReleases.filter(release => !knownIds.has(String(release.shootId))
    && !knownIds.has(String(release.entryId))
    && moment(release.date).isAfter(afterDate));

  console.log(`\x1b[90m${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique recent releases\x1b[0m`);

  const collectedReleases = accReleases.concat(uniqueReleases);
  const oldestReleaseOnPage = latestReleases.slice(-1)[0].date;

  if (uniqueReleases.length > 0 && moment(oldestReleaseOnPage).isAfter(afterDate)) {
    return fetchNewReleases(scraper, site, afterDate, collectedReleases, page + 1);
  }

  return collectedReleases;
}
/**
 * Scrape the recent and upcoming releases of a single site.
 *
 * The scraper is looked up by site slug first and by network slug second.
 * With `argv.deep` each recent release that has a URL is enriched through
 * fetchScene(); with `argv.save` the recent releases are stored. A failure
 * while scraping is logged and yields an empty list for this site.
 *
 * @param {Object} site - curated site, including its `network`
 * @returns {Promise<Array<Object>>} recent releases followed by upcoming
 *   releases (flagged `upcoming: true`), each tagged with the site's network
 */
async function fetchSiteReleases(site) {
  const scraper = scrapers[site.slug] || scrapers[site.network.slug];

  if (!scraper) {
    console.error(`Could not find scraper for '${site.name}' (${site.slug})`);

    return [];
  }

  try {
    // argv.after is split on spaces and passed on as the arguments of subtract()
    const afterDate = moment.utc().subtract(...argv.after.split(' ')).toDate();

    const [newReleases, upcomingReleases] = await Promise.all([
      fetchNewReleases(scraper, site, afterDate),
      scraper.fetchUpcoming ? scraper.fetchUpcoming(site) : [],
    ]);

    console.log(`${site.name}: Found ${newReleases.length} recent releases, ${upcomingReleases.length} upcoming releases`);

    const finalReleases = argv.deep
      ? await Promise.map(newReleases, async (release) => {
        if (!release.url) {
          return release;
        }

        const scene = await fetchScene(release.url, release);

        return {
          ...release,
          ...scene,
        };
      }, {
        concurrency: 2,
      })
      : newReleases;

    if (argv.save) {
      await storeReleases(finalReleases);
    }

    return [
      ...finalReleases.map(release => ({
        ...release,
        network: site.network,
      })),
      ...upcomingReleases.map(release => ({
        ...release,
        network: site.network,
        upcoming: true,
      })),
    ];
  } catch (error) {
    if (argv.debug) {
      console.error(`${site.id}: Failed to fetch releases`, error);
    } else {
      console.log(`${site.id}: Failed to fetch releases`);
    }

    return [];
  }
}

/**
 * Scrape all included sites and return their releases sorted by date,
 * newest first.
 *
 * Sites are processed two at a time. The database connection is destroyed
 * before returning, including when no sites were found or an error is
 * thrown.
 *
 * @returns {Promise<Array<Object>>} the sorted releases, or an empty array
 *   when none of the requested sites exist in the database
 */
async function fetchReleases() {
  try {
    const sites = await accumulateIncludedSites();

    if (sites.length === 0) {
      console.error('None of the specified sites are in the database');

      return [];
    }

    const scenesPerSite = await Promise.map(sites, fetchSiteReleases, {
      concurrency: 2,
    });

    const accumulatedScenes = [].concat(...scenesPerSite);

    return accumulatedScenes.sort(({ date: dateA }, { date: dateB }) => moment(dateB).diff(dateA));
  } finally {
    // release the connection on every exit path, not only on success
    await knex.destroy();
  }
}
module.exports = fetchReleases;