// traxxx/src/fetch-releases.js

'use strict';

const config = require('config');
const moment = require('moment');
const argv = require('./argv');
const knex = require('./knex');
const scrapers = require('./scrapers');
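
// Splits the configured include list into plain network IDs and per-network
// site IDs. The expected shape (inferred from the reduce below, not documented
// here) is a mixed array, e.g.:
//
//   include: ['networkA', ['networkB', ['siteX', 'siteY']]]
//
// where a plain string includes a whole network and a [network, sites] tuple
// includes only the listed sites. Note that the network ID of a tuple is
// discarded; only its site IDs (network[1]) are kept.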
function destructConfigNetworks(networks) {
  return networks.reduce((acc, network) => {
    if (Array.isArray(network)) {
      // network specifies sites
      return {
        ...acc,
        sites: [...acc.sites, ...network[1]],
      };
    }

    return {
      ...acc,
      networks: [...acc.networks, network],
    };
  }, {
    networks: [],
    sites: [],
  });
}
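
// Maps raw snake_case database rows to camelCase site objects. `parameters`
// is stored as a JSON string and parsed here; JSON.parse(null) yields null,
// so sites without parameters pass through safely.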
function curateSites(sites) {
  return sites.map(site => ({
    id: site.id,
    name: site.name,
    description: site.description,
    url: site.url,
    networkId: site.network_id,
    parameters: JSON.parse(site.parameters),
  }));
}
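
// Resolves the list of sites to scrape: --sites/--networks CLI arguments take
// precedence; otherwise the include list from the config file is used.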
async function accumulateIncludedSites() {
  if (argv.networks || argv.sites) {
    const rawSites = await knex('sites')
      .whereIn('id', argv.sites || [])
      .orWhereIn('network_id', argv.networks || []);

    return curateSites(rawSites);
  }

  const included = destructConfigNetworks(config.include);

  const rawSites = await knex('sites')
    .whereIn('id', included.sites)
    .orWhereIn('network_id', included.networks);

  return curateSites(rawSites);
}
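
// Looks up releases already stored in the database by shoot ID or entry ID,
// so previously scraped items can be filtered out. The _siteId parameter is
// currently unused; matching is global across sites.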
async function findDuplicateReleases(latestReleases, _siteId) {
  const latestReleasesShootIds = latestReleases.map(release => release.shootId).filter(shootId => shootId !== undefined);
  const latestReleasesEntryIds = latestReleases.map(release => release.entryId).filter(entryId => entryId !== undefined);

  return knex('releases')
    .whereIn('shoot_id', latestReleasesShootIds)
    .orWhereIn('entry_id', latestReleasesEntryIds);
}
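
// Flattens scraped releases into the columns of the releases table and
// inserts them, ignoring rows that collide with existing ones.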
async function storeReleases(releases) {
  const curatedReleases = releases.map(release => ({
    site_id: release.site.id,
    shoot_id: release.shootId || null,
    entry_id: release.entryId || null,
    url: release.url,
    title: release.title,
    date: release.date,
    description: release.description,
    director: release.director,
    duration: release.duration,
    likes: release.rating && release.rating.likes,
    dislikes: release.rating && release.rating.dislikes,
    rating: release.rating && release.rating.stars,
  }));

  if (curatedReleases.length) {
    console.log(`Saving ${curatedReleases.length} new releases to database`);

    // knex does not expose INSERT OR IGNORE directly, so rewrite the generated
    // SQL; this is SQLite dialect and relies on a unique constraint on
    // shoot_id/entry_id to skip duplicate rows
    const insertQuery = knex('releases').insert(curatedReleases).toString();
    await knex.raw(insertQuery.replace('insert', 'INSERT OR IGNORE'));

    return curatedReleases;
  }

  return [];
}
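
// Recursively scrapes pages of latest releases until a page's oldest release
// predates afterDate, a page yields no new unique releases, or a page comes
// back empty. Duplicates are filtered against both the database and the
// releases accumulated from earlier pages.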
async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page = 1) {
  const latestReleases = await scraper.fetchLatest(site, page);

  if (latestReleases.length === 0) {
    // an empty page ends pagination, but earlier pages are kept
    return accReleases;
  }

  const duplicateReleases = await findDuplicateReleases(latestReleases, site.id);

  // coerce IDs to strings on both sides so Set membership checks match
  const duplicateReleasesIds = new Set(
    duplicateReleases
      .map(release => String(release.shoot_id || release.entry_id))
      // treat accumulated releases as duplicates to prevent an infinite loop if the next page contains the same releases as the previous
      .concat(accReleases.map(release => String(release.shootId || release.entryId))),
  );

  const uniqueReleases = latestReleases.filter(release => !duplicateReleasesIds.has(String(release.shootId))
    && !duplicateReleasesIds.has(String(release.entryId))
    && moment(release.date).isAfter(afterDate));

  console.log(`\x1b[90m${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique recent releases\x1b[0m`);

  const oldestReleaseOnPage = latestReleases.slice(-1)[0].date;

  if (uniqueReleases.length > 0 && moment(oldestReleaseOnPage).isAfter(afterDate)) {
    return fetchNewReleases(scraper, site, afterDate, accReleases.concat(uniqueReleases), page + 1);
  }

  return accReleases.concat(uniqueReleases);
}
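
// Entry point: scrapes every included site in parallel, optionally persists
// the results (--save), and returns recent and upcoming releases sorted by
// date, newest first.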
async function fetchReleases() {
  const sites = await accumulateIncludedSites();

  const scenesPerSite = await Promise.all(sites.map(async (site) => {
    // prefer a site-specific scraper, fall back to the network's scraper
    const scraper = scrapers[site.id] || scrapers[site.networkId];

    if (!scraper) {
      return [];
    }

    try {
      // argv.after is a moment duration such as '1 week'
      const afterDate = moment.utc().subtract(...argv.after.split(' ')).toDate();

      const [newReleases, upcomingReleases] = await Promise.all([
        fetchNewReleases(scraper, site, afterDate),
        scraper.fetchUpcoming ? scraper.fetchUpcoming(site) : [],
      ]);

      console.log(`${site.name}: Found ${newReleases.length} recent releases, ${upcomingReleases.length} upcoming releases`);

      if (argv.save) {
        await storeReleases(newReleases);
      }

      return [...newReleases, ...upcomingReleases.map(release => ({ ...release, upcoming: true }))];
    } catch (error) {
      if (argv.debug) {
        console.error(`${site.id}: Failed to fetch releases`, error);
      } else {
        console.log(`${site.id}: Failed to fetch releases`);
      }

      return [];
    }
  }));

  const accumulatedScenes = scenesPerSite.reduce((acc, siteScenes) => [...acc, ...siteScenes], []);
  const sortedScenes = accumulatedScenes.sort(({ date: dateA }, { date: dateB }) => moment(dateB).diff(dateA));

  return sortedScenes;
}

module.exports = fetchReleases;
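
// Minimal usage sketch (hypothetical caller; the actual entry point lives
// elsewhere in the repository and may differ):
//
//   const fetchReleases = require('./fetch-releases');
//
//   fetchReleases()
//     .then((releases) => {
//       console.log(`Fetched ${releases.length} releases in total`);
//     })
//     .catch(console.error);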