2019-03-23 21:48:39 +00:00
|
|
|
'use strict';
|
|
|
|
|
|
|
|
const config = require('config');
|
2019-05-08 03:50:13 +00:00
|
|
|
const fs = require('fs-extra');
|
|
|
|
const path = require('path');
|
2019-05-06 00:01:57 +00:00
|
|
|
const Promise = require('bluebird');
|
2019-03-23 21:48:39 +00:00
|
|
|
const moment = require('moment');
|
2019-09-25 02:52:58 +00:00
|
|
|
const mime = require('mime');
|
2019-05-08 03:50:13 +00:00
|
|
|
const bhttp = require('bhttp');
|
2019-03-23 21:48:39 +00:00
|
|
|
|
2019-04-04 02:00:28 +00:00
|
|
|
const argv = require('./argv');
|
2019-03-25 02:57:33 +00:00
|
|
|
const knex = require('./knex');
|
2019-03-23 21:48:39 +00:00
|
|
|
const scrapers = require('./scrapers');
|
2019-05-06 00:01:57 +00:00
|
|
|
const fetchScene = require('./fetch-scene');
|
2019-03-23 21:48:39 +00:00
|
|
|
|
2019-03-25 02:57:33 +00:00
|
|
|
/**
 * Split the configured include list into network entries and site entries.
 *
 * A plain (non-array) entry is collected as a network. An array entry is
 * treated as a network that specifies sites: the items of its second element
 * are collected as sites, and the network itself is not collected.
 *
 * @param {Array} [networks=[]] - Entries of the include list; a missing list is treated as empty.
 * @returns {{ networks: Array, sites: Array }} Collected entries, in input order.
 */
function destructConfigNetworks(networks = []) {
    const included = {
        networks: [],
        sites: [],
    };

    networks.forEach((network) => {
        if (Array.isArray(network)) {
            // network specifies sites
            included.sites.push(...network[1]);
            return;
        }

        included.networks.push(network);
    });

    return included;
}
|
|
|
|
|
2019-03-25 02:57:33 +00:00
|
|
|
/**
 * Reshape flat site rows into site objects with a nested network.
 *
 * The `network_id`, `network_name` and `network_slug` columns are grouped
 * under `network`, and the `parameters` column is parsed as JSON.
 *
 * @param {Array<Object>} sites - Raw site rows.
 * @returns {Array<Object>} One curated site per row, in the same order.
 */
function curateSites(sites) {
    return sites.map((row) => {
        const network = {
            id: row.network_id,
            name: row.network_name,
            slug: row.network_slug,
        };

        return {
            id: row.id,
            name: row.name,
            slug: row.slug,
            description: row.description,
            url: row.url,
            network,
            parameters: JSON.parse(row.parameters),
        };
    });
}
|
2019-03-23 21:48:39 +00:00
|
|
|
|
2019-03-25 02:57:33 +00:00
|
|
|
/**
 * Fetch and curate the sites whose slug is listed, plus every site that
 * belongs to one of the listed networks.
 *
 * @param {Array<string>} networkSlugs - Slugs of networks whose sites should all be included.
 * @param {Array<string>} siteSlugs - Slugs of individually included sites.
 * @returns {Promise<Array<Object>>} Curated sites, each with its network's id, name and slug.
 */
async function fetchSitesBySlugs(networkSlugs, siteSlugs) {
    const networks = await knex('networks').select('id').whereIn('slug', networkSlugs);
    const networkIds = networks.map(network => network.id);

    const rawSites = await knex('sites')
        // the network slug is required downstream to fall back to a network-level scraper
        .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug')
        .whereIn('sites.slug', siteSlugs)
        .orWhereIn('network_id', networkIds)
        .leftJoin('networks', 'sites.network_id', 'networks.id');

    return curateSites(rawSites);
}

/**
 * Determine which sites to scrape.
 *
 * Networks and sites passed as arguments take precedence; when neither is
 * given, the include list from the configuration is used. Both paths run the
 * same query, so sites always carry the same network fields.
 *
 * @returns {Promise<Array<Object>>} Curated sites to scrape.
 */
async function accumulateIncludedSites() {
    if (argv.networks || argv.sites) {
        return fetchSitesBySlugs(argv.networks || [], argv.sites || []);
    }

    const included = destructConfigNetworks(config.include);

    return fetchSitesBySlugs(included.networks || [], included.sites || []);
}
|
|
|
|
|
2019-04-06 21:24:26 +00:00
|
|
|
/**
 * Query the stored releases that share a shoot ID or an entry ID with any of
 * the given releases.
 *
 * @param {Array<Object>} latestReleases - Scraped releases; `shootId` and `entryId` are read when defined.
 * @param {*} _siteId - Unused; the lookup is not restricted to a site.
 * @returns {Promise<Array<Object>>} Rows from the releases table matching either ID list.
 */
async function findDuplicateReleases(latestReleases, _siteId) {
    const shootIds = [];
    const entryIds = [];

    latestReleases.forEach((release) => {
        if (release.shootId !== undefined) {
            shootIds.push(release.shootId);
        }

        if (release.entryId !== undefined) {
            entryIds.push(release.entryId);
        }
    });

    const duplicatesQuery = knex('releases')
        .whereIn('shoot_id', shootIds)
        .orWhereIn('entry_id', entryIds);

    return duplicatesQuery;
}
|
|
|
|
|
2019-09-26 01:27:01 +00:00
|
|
|
/**
 * Make sure every actor named on the release exists in the actors table and
 * associate each of them with the stored release.
 *
 * Names are de-duplicated first, so a name listed twice is inserted and
 * associated only once. Unknown actors are inserted with ON CONFLICT DO
 * NOTHING; rows skipped by that clause are not part of the RETURNING result
 * (this can happen when another release, stored concurrently, inserted the
 * same actor first), so they are looked up again to avoid silently dropping
 * the association.
 *
 * @param {Object} release - Scraped release; `actors` is a list of actor names.
 * @param {Object} releaseEntry - Stored release row; `id` is used as the release ID.
 * @returns {Promise<Array<Object>>} Result of inserting the associations, returning all columns.
 */
async function storeActors(release, releaseEntry) {
    const actorNames = [...new Set(release.actors)];

    const existingActors = await knex('actors').whereIn('name', actorNames);
    const existingNames = new Set(existingActors.map(actor => actor.name));
    const newActorNames = actorNames.filter(actorName => !existingNames.has(actorName));

    const { rows: insertedActors } = newActorNames.length > 0
        ? await knex.raw(`${knex('actors').insert(newActorNames.map(actorName => ({
            name: actorName,
            slug: actorName.toLowerCase().replace(/\s+/g, '-'),
        }))).toString()} ON CONFLICT DO NOTHING RETURNING *`)
        : { rows: [] };

    // actors skipped by the conflict clause already exist by now; fetch them instead of losing them
    const insertedNames = new Set(insertedActors.map(actor => actor.name));
    const skippedNames = newActorNames.filter(actorName => !insertedNames.has(actorName));

    const skippedActors = skippedNames.length > 0
        ? await knex('actors').whereIn('name', skippedNames)
        : [];

    const releaseActors = existingActors.concat(insertedActors, skippedActors);

    return knex('actors_associated').insert(releaseActors.map(actor => ({
        release_id: releaseEntry.id,
        actor_id: actor.id,
    })), '*');
}
|
|
|
|
|
|
|
|
/**
 * Associate the release's tags with the stored release.
 *
 * @param {Object} release - Scraped release; each item of `tags` is stored as a tag ID.
 * @param {Object} releaseEntry - Stored release row; `id` is used as the release ID.
 * @returns {Promise<*>} Result of the insert into the tag association table.
 */
async function storeTags(release, releaseEntry) {
    const associations = release.tags.map(tagId => ({
        tag_id: tagId,
        release_id: releaseEntry.id,
    }));

    return knex('tags_associated').insert(associations);
}
|
|
|
|
|
2019-09-08 01:53:09 +00:00
|
|
|
/**
 * Download the release's photos, write them below the configured photo path
 * and register them in the media table.
 *
 * Photos are fetched two at a time. Each file is named after its 1-based
 * position in `release.photos`, with an extension derived from the MIME type
 * guessed from the URL path. The media rows store the 0-based position.
 *
 * @param {Object} release - Scraped release; reads `photos`, `site.name`, `site.slug` and `title`.
 * @param {Object} releaseEntry - Stored release row; `id` names the directory and the media target.
 * @returns {Promise<void>}
 */
async function storePhotos(release, releaseEntry) {
    console.log(`Storing ${release.photos.length} photos for (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);

    async function downloadPhoto(photoUrl, index) {
        const photoLocation = new URL(photoUrl);
        const mimetype = mime.getType(photoLocation.pathname);

        const response = await bhttp.get(photoUrl);

        const filename = `${index + 1}.${mime.getExtension(mimetype)}`;
        const filepath = path.join(release.site.slug, releaseEntry.id.toString(), filename);

        await fs.writeFile(path.join(config.photoPath, filepath), response.body);

        return { filepath, mimetype };
    }

    const files = await Promise.map(release.photos, downloadPhoto, { concurrency: 2 });

    const mediaRows = files.map((file, index) => ({
        path: file.filepath,
        mime: file.mimetype,
        index,
        domain: 'releases',
        target_id: releaseEntry.id,
        role: 'photo',
    }));

    await knex('media').insert(mediaRows);
}
|
2019-09-10 14:48:04 +00:00
|
|
|
|
2019-09-25 02:52:58 +00:00
|
|
|
/**
 * Download the release's poster, write it below the configured photo path and
 * register it in the media table.
 *
 * The file is named `poster`, with an extension derived from the MIME type
 * guessed from the URL path.
 *
 * @param {Object} release - Scraped release; reads `poster`, `site.name`, `site.slug` and `title`.
 * @param {Object} releaseEntry - Stored release row; `id` names the directory and the media target.
 * @returns {Promise<void>}
 */
async function storePoster(release, releaseEntry) {
    console.log(`Storing poster for (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);

    const posterLocation = new URL(release.poster);
    const mimetype = mime.getType(posterLocation.pathname);

    const response = await bhttp.get(release.poster);

    const filename = `poster.${mime.getExtension(mimetype)}`;
    const filepath = path.join(release.site.slug, releaseEntry.id.toString(), filename);

    await fs.writeFile(path.join(config.photoPath, filepath), response.body);

    const mediaRow = {
        path: filepath,
        mime: mimetype,
        domain: 'releases',
        target_id: releaseEntry.id,
        role: 'poster',
    };

    await knex('media').insert(mediaRow);
}
|
|
|
|
|
|
|
|
/**
 * Download the release's trailer, write it below the configured photo path
 * and register it in the media table.
 *
 * The MIME type is taken from the trailer itself when provided, and guessed
 * from the URL path otherwise. The file is named `trailer`, suffixed with the
 * quality when one is known.
 *
 * @param {Object} release - Scraped release; reads `trailer.src`, `trailer.type`, `trailer.quality`, `site.name`, `site.slug` and `title`.
 * @param {Object} releaseEntry - Stored release row; `id` names the directory and the media target.
 * @returns {Promise<void>}
 */
async function storeTrailer(release, releaseEntry) {
    console.log(`Storing trailer for (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);

    const { pathname } = new URL(release.trailer.src);
    const mimetype = release.trailer.type || mime.getType(pathname);

    const res = await bhttp.get(release.trailer.src);

    const qualitySuffix = release.trailer.quality ? `_${release.trailer.quality}` : '';
    const filepath = path.join(release.site.slug, releaseEntry.id.toString(), `trailer${qualitySuffix}.${mime.getExtension(mimetype)}`);

    // the release directory is not guaranteed to exist for a release without poster or photos,
    // so let fs-extra create missing parent directories instead of failing on the write
    await fs.outputFile(path.join(config.photoPath, filepath), res.body);

    await knex('media').insert({
        path: filepath,
        mime: mimetype,
        domain: 'releases',
        target_id: releaseEntry.id,
        role: 'trailer',
        quality: release.trailer.quality || null,
    });
}
|
|
|
|
|
2019-05-06 00:01:57 +00:00
|
|
|
/**
 * Store scraped releases and their actors, tags and media, two releases at a
 * time.
 *
 * Each release is inserted with ON CONFLICT DO NOTHING. When no row comes
 * back, the release is reported as a possible collision and nothing else is
 * stored for it.
 *
 * @param {Array<Object>} [releases=[]] - Scraped releases, each carrying its `site`.
 * @returns {Promise<Array<undefined>>} Resolves once every release has been processed.
 */
async function storeReleases(releases = []) {
    return Promise.map(releases, async (release) => {
        const curatedRelease = {
            site_id: release.site.id,
            shoot_id: release.shootId || null,
            entry_id: release.entryId || null,
            url: release.url,
            title: release.title,
            date: release.date,
            description: release.description,
            // director: release.director,
            duration: release.duration,
            // photos: release.photos ? release.photos.length : 0,
            likes: release.rating && release.rating.likes,
            dislikes: release.rating && release.rating.dislikes,
            rating: release.rating && release.rating.stars,
            deep: argv.deep,
        };

        const releaseQuery = `${knex('releases').insert(curatedRelease).toString()} ON CONFLICT DO NOTHING RETURNING *`;
        const { rows: [releaseEntry] } = await knex.raw(releaseQuery);

        if (!releaseEntry) {
            console.error(`Unable to save scene to database, possible collision: "${release.title}" (${release.site.name})`);
            return;
        }

        console.log(`Stored (${release.site.name}, ${releaseEntry.id}) "${release.title}"`);

        const hasActors = release.actors && release.actors.length > 0;
        const hasTags = release.tags && release.tags.length > 0;
        const hasPhotos = release.photos && release.photos.length > 0;

        // trailers are written to the same directory as posters and photos,
        // so the directory is needed for a trailer-only release as well
        if (release.poster || hasPhotos || release.trailer) {
            await fs.mkdir(path.join(config.photoPath, release.site.slug, releaseEntry.id.toString()), { recursive: true });
        }

        await Promise.all([
            hasActors ? storeActors(release, releaseEntry) : Promise.resolve(),
            hasTags ? storeTags(release, releaseEntry) : Promise.resolve(),
            hasPhotos ? storePhotos(release, releaseEntry) : Promise.resolve(),
            release.poster ? storePoster(release, releaseEntry) : Promise.resolve(),
            release.trailer ? storeTrailer(release, releaseEntry) : Promise.resolve(),
        ]);
    }, {
        concurrency: 2,
    });
}
|
|
|
|
|
2019-04-05 01:45:40 +00:00
|
|
|
/**
 * Scrape a site's latest releases page by page, collecting the releases that
 * are newer than `afterDate` and not known yet.
 *
 * A release is known when its shoot ID or entry ID matches a stored release
 * or a release accumulated from an earlier page. Pagination continues while a
 * page yields unique releases and its last release is newer than `afterDate`.
 *
 * @param {Object} scraper - Scraper exposing `fetchLatest(site, page)`.
 * @param {Object} site - Site to scrape; `id` and `name` are read here.
 * @param {Date} afterDate - Releases dated at or before this moment are dropped.
 * @param {Array<Object>} [accReleases=[]] - Releases accumulated from previous pages.
 * @param {number} [page=1] - Page to fetch.
 * @returns {Promise<Array<Object>>} Accumulated unique releases.
 */
async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page = 1) {
    const latestReleases = await scraper.fetchLatest(site, page);

    if (latestReleases.length === 0) {
        // an empty page ends pagination, but must not discard what earlier pages yielded
        return accReleases;
    }

    const duplicateReleases = await findDuplicateReleases(latestReleases, site.id);

    // IDs are compared as strings, because stored and scraped IDs are not guaranteed to share a type
    const knownIds = new Set(
        duplicateReleases
            .reduce((ids, release) => ids.concat(release.shoot_id, release.entry_id), [])
            // exclude accumulated releases to prevent an infinite loop if the next page contains the same releases as the previous
            .concat(accReleases.reduce((ids, release) => ids.concat(release.shootId, release.entryId), []))
            .filter(id => id !== undefined && id !== null)
            .map(String),
    );

    const isKnown = id => id !== undefined && id !== null && knownIds.has(String(id));

    const uniqueReleases = latestReleases.filter(release => !isKnown(release.shootId)
        && !isKnown(release.entryId)
        && moment(release.date).isAfter(afterDate));

    console.log(`\x1b[90m${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique recent releases\x1b[0m`);

    const oldestReleaseOnPage = latestReleases[latestReleases.length - 1].date;

    if (uniqueReleases.length > 0 && moment(oldestReleaseOnPage).isAfter(afterDate)) {
        return fetchNewReleases(scraper, site, afterDate, accReleases.concat(uniqueReleases), page + 1);
    }

    return accReleases.concat(uniqueReleases);
}
|
|
|
|
|
2019-03-23 21:48:39 +00:00
|
|
|
/**
 * Fetch the scene page of every release that has a URL and merge the result
 * over the release; releases without a URL are passed through unchanged.
 *
 * @param {Array<Object>} releases - Releases scraped from a listing.
 * @returns {Promise<Array<Object>>} Releases in the same order, two fetched at a time.
 */
async function fetchDeepReleases(releases) {
    return Promise.map(releases, async (release) => {
        if (!release.url) {
            return release;
        }

        const scene = await fetchScene(release.url, release);

        return {
            ...release,
            ...scene,
        };
    }, {
        concurrency: 2,
    });
}

/**
 * Scrape the recent and upcoming releases of a single site, optionally
 * fetching scene details and saving the recent releases.
 *
 * Failures are logged and reported as an empty result, so that one failing
 * site does not abort the others.
 *
 * @param {Object} site - Curated site, including its `network`.
 * @returns {Promise<Array<Object>>} Recent releases followed by upcoming releases (flagged `upcoming`), each tagged with the site's network.
 */
async function fetchSiteReleases(site) {
    const scraper = scrapers[site.slug] || scrapers[site.network.slug];

    if (!scraper) {
        console.error(`Cound not find scraper for '${site.name}' (${site.slug})`);

        return [];
    }

    try {
        const afterDate = moment.utc().subtract(...argv.after.split(' ')).toDate();

        // both promises are handed to Promise.all right away, so neither rejection goes unobserved
        const [newReleases, upcomingReleases] = await Promise.all([
            fetchNewReleases(scraper, site, afterDate),
            scraper.fetchUpcoming ? scraper.fetchUpcoming(site) : [],
        ]);

        console.log(`${site.name}: Found ${newReleases.length} recent releases, ${upcomingReleases.length} upcoming releases`);

        const finalReleases = argv.deep
            ? await fetchDeepReleases(newReleases)
            : newReleases;

        if (argv.save) {
            await storeReleases(finalReleases);
        }

        return [
            ...finalReleases.map(release => ({
                ...release,
                network: site.network,
            })),
            ...upcomingReleases.map(release => ({
                ...release,
                network: site.network,
                upcoming: true,
            })),
        ];
    } catch (error) {
        if (argv.debug) {
            console.error(`${site.id}: Failed to fetch releases`, error);

            return [];
        }

        console.log(`${site.id}: Failed to fetch releases`);

        return [];
    }
}

/**
 * Scrape all included sites, two at a time, and return their releases sorted
 * from newest to oldest.
 *
 * The database connection is closed before returning, whether scraping
 * succeeded, no sites were found, or an error was thrown.
 *
 * @returns {Promise<Array<Object>>} Recent and upcoming releases of all sites.
 */
async function fetchReleases() {
    try {
        const sites = await accumulateIncludedSites();

        if (sites.length === 0) {
            console.error('None of the specified sites are in the database');

            return [];
        }

        const scenesPerSite = await Promise.map(sites, site => fetchSiteReleases(site), {
            concurrency: 2,
        });

        const accumulatedScenes = [].concat(...scenesPerSite);

        return accumulatedScenes.sort(({ date: dateA }, { date: dateB }) => moment(dateB).diff(dateA));
    } finally {
        // release the connection on every exit path, so the process is not kept alive
        await knex.destroy();
    }
}
|
|
|
|
|
|
|
|
module.exports = fetchReleases;
|