'use strict';

const config = require('config');
const fs = require('fs-extra');
const path = require('path');
const Promise = require('bluebird');
const moment = require('moment');
const bhttp = require('bhttp');

const argv = require('./argv');
const knex = require('./knex');
const scrapers = require('./scrapers');
const fetchScene = require('./fetch-scene');

// Split the configured include list into plain network IDs and
// [networkId, [siteIds]] tuples that pin individual sites.
function destructConfigNetworks(networks) {
  return networks.reduce((acc, network) => {
    if (Array.isArray(network)) {
      // Tuple form: this network entry specifies individual sites.
      return {
        ...acc,
        sites: [...acc.sites, ...network[1]],
      };
    }

    return {
      ...acc,
      networks: [...acc.networks, network],
    };
  }, {
    networks: [],
    sites: [],
  });
}

// Map raw site rows (joined with their network name) onto the shape used
// throughout the scraper pipeline.
function curateSites(sites) {
  return sites.map(site => ({
    id: site.id,
    name: site.name,
    description: site.description,
    url: site.url,
    network: {
      id: site.network_id,
      name: site.network_name,
    },
    parameters: JSON.parse(site.parameters),
  }));
}

// Resolve which sites to scrape: CLI --sites/--networks take precedence,
// otherwise fall back to the include list from the config file.
async function accumulateIncludedSites() {
  if (argv.networks || argv.sites) {
    const rawSites = await knex('sites')
      .select('sites.*', 'networks.name as network_name')
      .whereIn('sites.id', argv.sites || [])
      .orWhereIn('network_id', argv.networks || [])
      .leftJoin('networks', 'sites.network_id', 'networks.id');

    return curateSites(rawSites);
  }

  const included = destructConfigNetworks(config.include);

  const rawSites = await knex('sites')
    .select('sites.*', 'networks.name as network_name')
    .whereIn('sites.id', included.sites)
    .orWhereIn('network_id', included.networks)
    .leftJoin('networks', 'sites.network_id', 'networks.id');

  return curateSites(rawSites);
}

// Fetch stored releases matching the scraped batch by shoot ID or entry ID.
// _siteId is currently unused, but kept for interface stability.
async function findDuplicateReleases(latestReleases, _siteId) {
  const latestReleasesShootIds = latestReleases
    .map(release => release.shootId)
    .filter(shootId => shootId !== undefined);

  const latestReleasesEntryIds = latestReleases
    .map(release => release.entryId)
    .filter(entryId => entryId !== undefined);

  return knex('releases')
    .whereIn('shoot_id', latestReleasesShootIds)
    .orWhereIn('entry_id', latestReleasesEntryIds);
}

// Associate the release's actor names with the release row, creating actor
// records for names not yet in the database.
async function storeActors(release, releaseId) {
  const actors = await knex('actors').whereIn('name', release.actors);
  const newActors = release.actors.filter(actorName => !actors.some(actor => actor.name === actorName));

  const { rows: insertedActors } = newActors.length
    ? await knex.raw(`${knex('actors').insert(newActors.map(actorName => ({ name: actorName })))} ON CONFLICT DO NOTHING RETURNING *`)
    : { rows: [] };

  await knex('actors_associated').insert(actors.concat(insertedActors).map(actor => ({
    release_id: releaseId,
    actor_id: actor.id,
  })), '*');
}

// Associate the release's tag IDs with the release row.
async function storeTags(release, releaseId) {
  await knex('tags_associated').insert(release.tags.map(tag => ({
    tag_id: tag,
    release_id: releaseId,
  })));
}

// Download each thumbnail into <thumbnailPath>/<siteId>/<releaseId>/<index>.jpg.
async function storeThumbnails(release, releaseId) {
  const thumbnailDir = path.join(config.thumbnailPath, release.site.id, releaseId.toString());
  await fs.mkdir(thumbnailDir, { recursive: true });

  await Promise.map(release.thumbnails, async (thumbnailUrl, index) => {
    const res = await bhttp.get(thumbnailUrl);
    await fs.writeFile(path.join(thumbnailDir, `${index}.jpg`), res.body);
  }, {
    concurrency: 2,
  });
}

// Persist scraped releases along with their actors, tags and thumbnails.
// Releases already stored (ON CONFLICT DO NOTHING) are skipped entirely.
async function storeReleases(releases = []) {
  return Promise.map(releases, async (release) => {
    const curatedRelease = {
      site_id: release.site.id,
      shoot_id: release.shootId || null,
      entry_id: release.entryId || null,
      url: release.url,
      title: release.title,
      date: release.date,
      description: release.description,
      director: release.director,
      duration: release.duration,
      likes: release.rating && release.rating.likes,
      dislikes: release.rating && release.rating.dislikes,
      rating: release.rating && release.rating.stars,
    };

    const releaseQuery = `${knex('releases').insert(curatedRelease).toString()} ON CONFLICT DO NOTHING RETURNING *`;
    const releaseEntry = await knex.raw(releaseQuery);
    const releaseRow = releaseEntry.rows[0];

    // ON CONFLICT DO NOTHING yields no rows for an existing release;
    // previously this crashed on rows[0].id. Skip associations instead.
    if (!releaseRow) {
      return;
    }

    if (release.actors && release.actors.length > 0) {
      await storeActors(release, releaseRow.id);
    }

    if (release.tags && release.tags.length > 0) {
      await storeTags(release, releaseRow.id);
    }

    if (release.thumbnails && release.thumbnails.length > 0) {
      await storeThumbnails(release, releaseRow.id);
    }
  }, {
    concurrency: 2,
  });
}

// Recursively scrape pages of latest releases until a page older than
// afterDate is reached, accumulating releases not yet in the database.
async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page = 1) {
  const latestReleases = await scraper.fetchLatest(site, page);

  if (latestReleases.length === 0) {
    return [];
  }

  const duplicateReleases = await findDuplicateReleases(latestReleases, site.id);

  // Known IDs, stringified so numeric database values compare equal to
  // scraped string values. Accumulated releases are included to prevent an
  // infinite loop if the next page contains the same releases as the
  // previous one.
  const duplicateReleaseIds = new Set(
    duplicateReleases
      .map(release => release.shoot_id || release.entry_id)
      .concat(accReleases.map(release => release.shootId || release.entryId))
      .filter(id => id !== undefined && id !== null)
      .map(String),
  );

  const uniqueReleases = latestReleases.filter(release => !duplicateReleaseIds.has(String(release.shootId))
    && !duplicateReleaseIds.has(String(release.entryId))
    && moment(release.date).isAfter(afterDate));

  console.log(`\x1b[90m${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique recent releases\x1b[0m`);

  const oldestReleaseOnPage = latestReleases.slice(-1)[0].date;

  if (uniqueReleases.length > 0 && moment(oldestReleaseOnPage).isAfter(afterDate)) {
    return fetchNewReleases(scraper, site, afterDate, accReleases.concat(uniqueReleases), page + 1);
  }

  return accReleases.concat(uniqueReleases);
}

// Scrape recent and upcoming releases for every included site, optionally
// deep-fetching scene details (--deep) and persisting them (--save).
// Returns all releases across sites, sorted newest first.
async function fetchReleases() {
  const sites = await accumulateIncludedSites();

  const scenesPerSite = await Promise.map(sites, async (site) => {
    // Site-specific scrapers take precedence over network-wide ones.
    const scraper = scrapers[site.id] || scrapers[site.network.id];

    if (!scraper) {
      return [];
    }

    try {
      // argv.after is e.g. "30 days" — spread into moment's subtract(amount, unit).
      const afterDate = moment.utc().subtract(...argv.after.split(' ')).toDate();

      const [newReleases, upcomingReleases] = await Promise.all([
        fetchNewReleases(scraper, site, afterDate),
        scraper.fetchUpcoming ? scraper.fetchUpcoming(site) : [],
      ]);

      console.log(`${site.name}: Found ${newReleases.length} recent releases, ${upcomingReleases.length} upcoming releases`);

      if (argv.save) {
        const finalReleases = argv.deep
          ? await Promise.map(newReleases, async (release) => {
            if (release.url) {
              const scene = await fetchScene(release.url, release);

              return {
                ...release,
                ...scene,
              };
            }

            return release;
          }, {
            concurrency: 2,
          })
          : newReleases;

        await storeReleases(finalReleases);
      }

      return [
        ...newReleases.map(release => ({
          ...release,
          network: site.network,
        })),
        ...upcomingReleases.map(release => ({
          ...release,
          network: site.network,
          upcoming: true,
        })),
      ];
    } catch (error) {
      if (argv.debug) {
        console.error(`${site.id}: Failed to fetch releases`, error);
        return [];
      }

      console.log(`${site.id}: Failed to fetch releases`);
      return [];
    }
  }, {
    concurrency: 2,
  });

  const accumulatedScenes = scenesPerSite.reduce((acc, siteScenes) => [...acc, ...siteScenes], []);
  const sortedScenes = accumulatedScenes.sort(({ date: dateA }, { date: dateB }) => moment(dateB).diff(dateA));

  return sortedScenes;
}

module.exports = fetchReleases;