diff --git a/assets/components/home/home.vue b/assets/components/home/home.vue index 2f54247d..1c145b3b 100644 --- a/assets/components/home/home.vue +++ b/assets/components/home/home.vue @@ -16,7 +16,7 @@ import FilterBar from '../header/filter-bar.vue'; import Releases from '../releases/releases.vue'; async function fetchReleases() { - this.releases = await this.$store.dispatch('fetchReleases', { limit: 100 }); + this.releases = await this.$store.dispatch('fetchReleases', { limit: 250 }); this.$store.commit('setCache', { target: 'home', releases: this.releases, diff --git a/assets/js/networks/actions.js b/assets/js/networks/actions.js index 7748a950..b78ed497 100644 --- a/assets/js/networks/actions.js +++ b/assets/js/networks/actions.js @@ -76,10 +76,13 @@ function initNetworksActions(store, _router) { { date: { lessThan: $before, - greaterThan: $after, + greaterThan: $after } }, { + date: { + isNull: true + }, createdAt: { lessThan: $beforeTime, greaterThan: $afterTime, diff --git a/package-lock.json b/package-lock.json index f598e8ed..ccfee71d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -4955,6 +4955,11 @@ "elliptic": "^6.0.0" } }, + "create-error": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/create-error/-/create-error-0.3.1.tgz", + "integrity": "sha1-aYECRaYp5lRDK/BDdzYAA6U1GiM=" + }, "create-hash": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz", @@ -10899,6 +10904,24 @@ "integrity": "sha1-mEcocL8igTL8vdhoEputEsPAKeM=", "dev": true }, + "promise-task-queue": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/promise-task-queue/-/promise-task-queue-1.2.0.tgz", + "integrity": "sha1-fz2NszEFNRuq/CqsFDRtcXwfHjs=", + "requires": { + "bluebird": "^3.3.3", + "create-error": "^0.3.1", + "debug": "^2.2.0", + "extend": "^3.0.0" + }, + "dependencies": { + "extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==" + } + } + }, "prop-types": { "version": "15.7.2", "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.7.2.tgz", diff --git a/package.json b/package.json index fa5708e5..cb71fe22 100644 --- a/package.json +++ b/package.json @@ -96,6 +96,7 @@ "pg": "^7.18.1", "postgraphile": "^4.5.5", "postgraphile-plugin-connection-filter": "^1.1.3", + "promise-task-queue": "^1.2.0", "prop-types": "^15.7.2", "react": "^16.12.0", "react-dom": "^16.12.0", diff --git a/seeds/00_tags.js b/seeds/00_tags.js index 4d850d09..6a9b9aae 100644 --- a/seeds/00_tags.js +++ b/seeds/00_tags.js @@ -748,14 +748,14 @@ const tags = [ priority: 10, }, { - name: 'double anal TP', + name: 'DA triple penetration', slug: 'da-tp', priority: 7, description: 'Triple penetration with two cocks in the ass, and one in the pussy. Also see [double vaginal TP](/tag/dv-tp).', group: 'penetration', }, { - name: 'double vaginal TP', + name: 'DV triple penetration', slug: 'dv-tp', priority: 7, description: 'Triple penetration with two cocks in the pussy, and one in the ass. 
Also see [double anal TP](/tag/da-tp).', diff --git a/src/argv.js b/src/argv.js index 975d67af..1ad12e96 100644 --- a/src/argv.js +++ b/src/argv.js @@ -101,6 +101,41 @@ const { argv } = yargs type: 'boolean', default: true, }) + .option('media', { + describe: 'Include any release media', + type: 'boolean', + default: true, + }) + .option('posters', { + describe: 'Include release posters', + type: 'boolean', + default: true, + }) + .option('covers', { + describe: 'Include release covers', + type: 'boolean', + default: true, + }) + .option('photos', { + describe: 'Include release photos', + type: 'boolean', + default: true, + }) + .option('trailers', { + describe: 'Include release trailers', + type: 'boolean', + default: true, + }) + .option('teasers', { + describe: 'Include release teasers', + type: 'boolean', + default: true, + }) + .option('avatars', { + describe: 'Include actor avatars', + type: 'boolean', + default: true, + }) .option('inspect', { describe: 'Show data in console.', type: 'boolean', diff --git a/src/media.js b/src/media.js index af69f04a..83ca4da0 100644 --- a/src/media.js +++ b/src/media.js @@ -2,7 +2,7 @@ const config = require('config'); const Promise = require('bluebird'); -const bhttp = require('bhttp'); +// const bhttp = require('bhttp'); const mime = require('mime'); const fs = require('fs-extra'); const sharp = require('sharp'); @@ -11,7 +11,9 @@ const blake2 = require('blake2'); const logger = require('./logger')(__filename); const knex = require('./knex'); +const { get } = require('./utils/http'); const { ex } = require('./utils/q'); +const chunk = require('./utils/chunk'); function getHash(buffer) { const hash = blake2.createHash('blake2b', { digestLength: 24 }); @@ -73,7 +75,8 @@ async function getEntropy(buffer) { } async function extractItem(source) { - const res = await bhttp.get(source.src); + // const res = await bhttp.get(source.src); + const res = await get(source.src); if (res.statusCode === 200) { const { q } = ex(res.body.toString()); @@ -114,7 +117,8 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att logger.verbose(`Fetching media item from ${source.src || source}`); - const res = await bhttp.get(source.src || source); + // const res = await bhttp.get(source.src || source); + const res = await get(source.src || source); if (res.statusCode === 200) { const { pathname } = new URL(source.src || source); @@ -150,9 +154,7 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att } async function fetchItems(itemSources, existingItemsBySource, domain, role) { - return Promise.map(itemSources, async (source, index) => fetchItem(source, index, existingItemsBySource, domain, role), { - concurrency: 10, - }).filter(Boolean); + return Promise.map(itemSources, async (source, index) => fetchItem(source, index, existingItemsBySource, domain, role)).filter(Boolean); } async function saveItems(items, domain, role) { @@ -182,10 +184,15 @@ async function saveItems(items, domain, role) { logger.verbose(`Saved ${domain} ${role} with thumbnail to ${filepath}`); return { - ...item, thumbnail, filepath, thumbpath, + mimetype: item.mimetype, + extension: item.extension, + hash: item.hash, + entropy: item.entropy, + quality: item.quality, + source: item.source, }; } @@ -199,8 +206,6 @@ async function saveItems(items, domain, role) { logger.error(`Failed to store ${domain} ${role} from ${item.source}: ${error.message}`); return null; } - }, { - concurrency: 20, }); } @@ -227,50 +232,55 @@ function groupItems(items) { 
} async function storeMedia(sources, domain, role) { - try { - const presentSources = sources.filter(Boolean); + const presentSources = sources.filter(Boolean); - if (presentSources.length === 0) { - return {}; - } - - // find source duplicates that don't need to be re-downloaded or re-saved - const existingSourceItems = await knex('media').whereIn('source', presentSources.flat().map(source => source.src || source)); - const { source: existingSourceItemsBySource, hash: existingSourceItemsByHash } = groupItems(existingSourceItems); - - // download media items from new sources - const fetchedItems = await fetchItems(presentSources, existingSourceItemsBySource, domain, role); - const { hash: fetchedItemsByHash } = groupItems(fetchedItems); - - // find hash duplicates that don't need to be re-saved - const uniqueFetchedItems = Object.values(fetchedItemsByHash); - const existingHashItems = await knex('media').whereIn('hash', uniqueFetchedItems.map(item => item.hash)); - const { hash: existingHashItemsByHash } = groupItems(existingHashItems); - - // save new items to disk - const newItems = uniqueFetchedItems.filter(item => !existingHashItemsByHash[item.hash]); - const savedItems = await saveItems(newItems, domain, role); - - // store new items in database - const curatedItemEntries = curateItemEntries(savedItems); - const storedItems = await knex('media').insert(curatedItemEntries).returning('*'); - const { hash: storedItemsByHash } = groupItems(Array.isArray(storedItems) ? storedItems : []); - - // accumulate existing and new items by source to be mapped onto releases - const itemsByHash = { ...existingSourceItemsByHash, ...existingHashItemsByHash, ...storedItemsByHash }; - const itemsBySource = { - ...existingSourceItemsBySource, - ...fetchedItems.reduce((acc, item) => ({ ...acc, [item.source]: itemsByHash[item.hash] }), {}), - }; - - logger.info(`Stored ${fetchedItems.length} new ${domain} ${role}s`); - - return itemsBySource; - } catch (error) { - logger.error(`Failed to store ${domain} ${role} batch: ${error.message}`); - - return null; + if (presentSources.length === 0) { + return {}; } + + // split up source list to prevent excessive RAM usage + const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => { + try { + // find source duplicates that don't need to be re-downloaded or re-saved + const existingSourceItems = await knex('media').whereIn('source', sourceChunk.flat().map(source => source.src || source)); + const { source: existingSourceItemsBySource, hash: existingSourceItemsByHash } = groupItems(existingSourceItems); + + // download media items from new sources + const fetchedItems = await fetchItems(sourceChunk, existingSourceItemsBySource, domain, role); + const { hash: fetchedItemsByHash } = groupItems(fetchedItems); + + // find hash duplicates that don't need to be re-saved + const uniqueFetchedItems = Object.values(fetchedItemsByHash); + const existingHashItems = await knex('media').whereIn('hash', uniqueFetchedItems.map(item => item.hash)); + const { hash: existingHashItemsByHash } = groupItems(existingHashItems); + + // save new items to disk + const newItems = uniqueFetchedItems.filter(item => !existingHashItemsByHash[item.hash]); + const savedItems = await saveItems(newItems, domain, role); + + // store new items in database + const curatedItemEntries = curateItemEntries(savedItems); + const storedItems = await knex('media').insert(curatedItemEntries).returning('*'); + const { hash: storedItemsByHash } = 
groupItems(Array.isArray(storedItems) ? storedItems : []);
+
+      // accumulate existing and new items by source to be mapped onto releases
+      const itemsByHash = { ...existingSourceItemsByHash, ...existingHashItemsByHash, ...storedItemsByHash };
+      const itemsBySource = {
+        ...existingSourceItemsBySource,
+        ...fetchedItems.reduce((acc, item) => ({ ...acc, [item.source]: itemsByHash[item.hash] }), {}),
+      };
+
+      logger.info(`Stored batch ${index + 1} with ${fetchedItems.length} new ${domain} ${role}s`);
+
+      return itemsBySource;
+    } catch (error) {
+      logger.error(`Failed to store ${domain} ${role} batch ${index + 1}: ${error.message}`);
+
+      return null;
+    }
+  }));
+
+  return itemChunksBySource.reduce((acc, itemChunk) => ({ ...acc, ...itemChunk }), {});
 }
 
 function extractPrimaryItem(associations, targetId, role, primaryRole, primaryItemsByTargetId) {
@@ -319,7 +329,7 @@ async function associateMedia(sourcesByTargetId, mediaBySource, domain, role, pr
   const primaryAssociations = associationsPerTarget.map(association => association[primaryRole]).filter(Boolean);
 
   logger.info(`Associated ${associations.length} ${role}s to ${domain}s`);
-  logger.info(`Associated ${primaryAssociations.length} extracted ${primaryRole}s to ${domain}s`);
+  if (primaryRole) logger.info(`Associated ${primaryAssociations.length} extracted ${primaryRole}s to ${domain}s`);
 
   return Promise.all([
     (associations.length > 0 && knex.raw(`${knex(`${domain}s_${role}s`).insert(associations).toString()} ON CONFLICT DO NOTHING`)),
diff --git a/src/releases.js b/src/releases.js
index bb6ff6df..5e8a2ca0 100644
--- a/src/releases.js
+++ b/src/releases.js
@@ -330,6 +330,10 @@ function accumulateMovies(releases) {
 }
 
 async function storeReleaseAssets(releases) {
+  if (!argv.media) {
+    return;
+  }
+
   const releasePostersById = releases.reduce((acc, release) => ({ ...acc, [release.id]: [release.poster] }), {});
   const releaseCoversById = releases.reduce((acc, release) => ({ ...acc, [release.id]: release.covers }), {});
   const releaseTrailersById = releases.reduce((acc, release) => ({ ...acc, [release.id]: [release.trailer] }), {});
@@ -340,28 +344,30 @@ async function storeReleaseAssets(releases) {
   }), {});
 
   const [posters, covers] = await Promise.all([
-    storeMedia(Object.values(releasePostersById).flat(), 'release', 'poster'),
-    storeMedia(Object.values(releaseCoversById).flat(), 'release', 'cover'),
+    argv.posters && storeMedia(Object.values(releasePostersById).flat(), 'release', 'poster'),
+    argv.covers && storeMedia(Object.values(releaseCoversById).flat(), 'release', 'cover'),
   ]);
 
   // ensure posters are available before fetching supplementary media
   await Promise.all([
-    (posters && associateMedia(releasePostersById, posters, 'release', 'poster')),
-    (covers && associateMedia(releaseCoversById, covers, 'release', 'cover')),
+    posters && associateMedia(releasePostersById, posters, 'release', 'poster'),
+    covers && associateMedia(releaseCoversById, covers, 'release', 'cover'),
   ]);
 
-  const photos = await storeMedia(Object.values(releasePhotosById).flat(), 'release', 'photo');
-  if (photos) await associateMedia(releasePhotosById, photos, 'release', 'photo');
+  if (argv.photos) {
+    const photos = await storeMedia(Object.values(releasePhotosById).flat(), 'release', 'photo');
+    if (photos) await associateMedia(releasePhotosById, photos, 'release', 'photo');
+  }
 
   // videos take a long time, fetch last
   const [trailers, teasers] = await Promise.all([
-    storeMedia(Object.values(releaseTrailersById).flat(), 'release', 'trailer'),
-    storeMedia(Object.values(releaseTeasersById).flat(), 'release', 'teaser'),
+    argv.trailers && storeMedia(Object.values(releaseTrailersById).flat(), 'release', 'trailer'),
+    argv.teasers && storeMedia(Object.values(releaseTeasersById).flat(), 'release', 'teaser'),
   ]);
 
   await Promise.all([
-    (trailers && associateMedia(releaseTrailersById, trailers, 'release', 'trailer')),
-    (teasers && associateMedia(releaseTeasersById, teasers, 'release', 'teaser')),
+    trailers && associateMedia(releaseTrailersById, trailers, 'release', 'trailer'),
+    teasers && associateMedia(releaseTeasersById, teasers, 'release', 'teaser'),
   ]);
 }
diff --git a/src/scrape-releases.js b/src/scrape-releases.js
index 53208abe..56333332 100644
--- a/src/scrape-releases.js
+++ b/src/scrape-releases.js
@@ -52,10 +52,20 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', prefli
   }
 
   if (type === 'scene' && !scraper.fetchScene) {
+    if (release) {
+      logger.warn(`The '${site.name}'-scraper cannot fetch individual scenes`);
+      return null;
+    }
+
     throw new Error(`The '${site.name}'-scraper cannot fetch individual scenes`);
   }
 
   if (type === 'movie' && !scraper.fetchMovie) {
+    if (release) {
+      logger.warn(`The '${site.name}'-scraper cannot fetch individual movies`);
+      return null;
+    }
+
     throw new Error(`The '${site.name}'-scraper cannot fetch individual movies`);
   }
 
@@ -80,7 +90,7 @@ async function scrapeReleases(sources, release = null, type = 'scene', preflight = null) {
   const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, release, type, preflight), {
     concurrency: 5,
-  });
+  }).filter(Boolean);
 
   const curatedReleases = scrapedReleases.map(scrapedRelease => ({ ...scrapedRelease, type }));
 
diff --git a/src/scrape-sites.js b/src/scrape-sites.js
index 783b6757..47084573 100644
--- a/src/scrape-sites.js
+++ b/src/scrape-sites.js
@@ -117,7 +117,7 @@ async function deepFetchReleases(baseReleases, preflight) {
 
         return release;
       } catch (error) {
-        logger.error(error.stack);
+        logger.error(`Failed to scrape ${release.url}: ${error}`);
 
         return {
           ...release,
diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js
index b9d09bf4..106b040c 100644
--- a/src/scrapers/julesjordan.js
+++ b/src/scrapers/julesjordan.js
@@ -211,8 +211,13 @@ async function scrapeScene(html, url, site) {
 
   release.title = $('.title_bar_hilite').text().trim();
 
-  const setIdIndex = html.indexOf('setid:"');
-  release.entryId = html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0];
+  const entryId = html.match(/showtagform\((\d+)\)/);
+
+  if (entryId) release.entryId = entryId[1];
+  else {
+    const setIdIndex = html.indexOf('setid:"');
+    if (setIdIndex !== -1) release.entryId = html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0];
+  }
 
   const dateElement = $('.update_date').text().trim();
   const dateComment = $('*')
diff --git a/src/scrapers/newsensations.js b/src/scrapers/newsensations.js
index 36140306..700426df 100644
--- a/src/scrapers/newsensations.js
+++ b/src/scrapers/newsensations.js
@@ -1,6 +1,6 @@
 'use strict';
 
-const { geta, edate } = require('../utils/q');
+const { geta, ed } = require('../utils/q');
 
 function scrapeBlockLatest(scenes) {
   return scenes.map(({ html, q, qa, qu, qt }) => {
@@ -13,7 +13,7 @@ function scrapeBlockLatest(scenes) {
     release.title = q('h4 a', true);
     release.url = qu('h4 a');
 
-    release.date = edate(html, 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/);
+    release.date = ed(html, 'MM/DD/YYYY', 
/\d{2}\/\d{2}\/\d{4}/); release.actors = qa('.tour_update_models a', true); @@ -22,8 +22,6 @@ function scrapeBlockLatest(scenes) { release.teaser = qt(); - console.log(release); - return release; }); } @@ -52,8 +50,6 @@ function scrapeClassicLatest(scenes) { const photoCount = q('.update_thumb', 'cnt'); [release.poster, ...release.photos] = Array.from({ length: photoCount }).map((value, index) => q('.update_thumb', `src${index}_3x`) || q('.update_thumb', `src${index}_2x`) || q('.update_thumb', `src${index}_1x`)); - console.log(release); - return release; }); } diff --git a/src/utils/chunk.js b/src/utils/chunk.js new file mode 100644 index 00000000..32f1d923 --- /dev/null +++ b/src/utils/chunk.js @@ -0,0 +1,8 @@ +'use strict'; + +function chunk(array, chunkSize) { + return Array.from({ length: Math.ceil(array.length / chunkSize) }) + .map((value, index) => array.slice(index * chunkSize, (index * chunkSize) + chunkSize)); +} + +module.exports = chunk; diff --git a/src/utils/http.js b/src/utils/http.js new file mode 100644 index 00000000..ed7398d0 --- /dev/null +++ b/src/utils/http.js @@ -0,0 +1,76 @@ +'use strict'; + +const bhttp = require('bhttp'); +const taskQueue = require('promise-task-queue'); + +const logger = require('../logger')(__filename); + +const queue = taskQueue(); + +queue.on('concurrencyReached:httpGet', () => { + logger.silly('Queueing GET requests'); +}); + +queue.on('concurrencyReached:httpPost', () => { + logger.silly('Queueing POST requests'); +}); + +queue.define('httpGet', async ({ + url, + timeout = 30000, + options = {}, +}) => { + logger.silly(`GET ${url}`); + + const res = await bhttp.get(url, { + responseTimeout: timeout, + ...options, + }); + + res.code = res.statusCode; + + return res; +}, { + concurrency: 20, +}); + +queue.define('httpPost', async ({ + url, + body, + timeout = 30000, + options = {}, +}) => { + logger.silly(`POST ${url} with ${body}`); + + const res = await bhttp.post(url, body, { + responseTimeout: timeout, + ...options, + }); + + res.code = res.statusCode; + + return res; +}, { + concurrency: 20, +}); + +async function get(url, options) { + return queue.push('httpGet', { + method: 'get', + url, + options, + }); +} + +async function post(url, body, options) { + return queue.push('httpPost', { + url, + body, + options, + }); +} + +module.exports = { + get, + post, +}; diff --git a/src/utils/q.js b/src/utils/q.js index 11388608..dacae2ca 100644 --- a/src/utils/q.js +++ b/src/utils/q.js @@ -2,7 +2,7 @@ const { JSDOM } = require('jsdom'); const moment = require('moment'); -const bhttp = require('bhttp'); +const http = require('./http'); function trim(str) { return str.trim().replace(/\s+/g, ' '); @@ -228,7 +228,7 @@ function extractAll(html, selector) { } async function get(url, selector, headers, all = false) { - const res = await bhttp.get(url, { + const res = await http.get(url, { headers, });
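
A minimal usage sketch of the two new utilities, assuming src/utils/http.js and src/utils/chunk.js land as introduced above; fetchPhotoBatches and its batch size are hypothetical illustrations, not part of this changeset:

'use strict';

const { get } = require('./utils/http');
const chunk = require('./utils/chunk');

// Hypothetical consumer: fetch a long list of URLs in batches of 50,
// mirroring the batch size hard-coded in storeMedia(). The task queue
// inside utils/http.js caps in-flight GETs at 20, so Promise.all over
// a batch cannot flood the network regardless of batch size.
async function fetchPhotoBatches(urls) {
  const batches = chunk(urls, 50);
  const responses = [];

  for (const batch of batches) {
    // process one batch at a time to bound memory usage
    // eslint-disable-next-line no-await-in-loop
    const results = await Promise.all(batch.map(url => get(url)));
    responses.push(...results.filter(res => res.statusCode === 200));
  }

  return responses;
}

module.exports = fetchPhotoBatches;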