Added rudimentary movie relations.
@@ -5,7 +5,7 @@ const knex = require('./knex');
 const initServer = require('./web/server');
 
 const scrapeSites = require('./scrape-sites');
-const { scrapeReleases, deepFetchReleases } = require('./scrape-releases');
+const { scrapeScenes, scrapeMovies, deepFetchReleases } = require('./scrape-releases');
 const { storeReleases } = require('./releases');
 const { scrapeActors, scrapeBasicActors } = require('./actors');
 
@@ -15,11 +15,11 @@ if (process.env.NODE_ENV === 'development') {
 
 async function init() {
     if (argv.scene) {
-        await scrapeReleases(argv.scene, null, 'scene');
+        await scrapeScenes(argv.scene);
     }
 
     if (argv.movie) {
-        await scrapeReleases(argv.movie, null, 'movie');
+        await scrapeMovies(argv.movie);
     }
 
     if (argv.scrape || argv.networks || argv.sites) {
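For reference while reading this hunk: the two new entry points imported from ./scrape-releases are defined later in this commit as thin wrappers that pin the release type, so the init() dispatch above stays a one-liner per argument:

    async function scrapeScenes(sources) {
        return scrapeReleases(sources, 'scene');
    }

    async function scrapeMovies(sources) {
        return scrapeReleases(sources, 'movie');
    }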
src/argv.js
@@ -29,12 +29,17 @@ const { argv } = yargs
         type: 'array',
         alias: 'actor',
     })
-    .option('with-releases', {
-        describe: 'Fetch all releases for an actor',
+    .option('with-scenes', {
+        describe: 'Fetch all scenes for an actor or movie',
         type: 'boolean',
-        alias: 'with-scenes',
+        alias: 'with-releases',
         default: false,
     })
+    .option('with-movies', {
+        describe: 'Fetch movies for scenes',
+        type: 'boolean',
+        default: true,
+    })
     .option('with-profiles', {
         describe: 'Scrape profiles for new actors after fetching scenes',
         type: 'boolean',
@@ -44,12 +49,12 @@ const { argv } = yargs
     .option('scene', {
         describe: 'Scrape scene info from URL',
         type: 'array',
-        alias: 'release',
+        alias: 'scenes',
     })
     .option('movie', {
         describe: 'Scrape movie info from URL',
         type: 'array',
-        alias: 'dvd',
+        alias: 'movies',
     })
     .option('sources', {
         describe: 'Use these scrapers for actor data',
@@ -121,11 +126,13 @@ const { argv } = yargs
         describe: 'Include release posters',
         type: 'boolean',
         default: true,
+        alias: 'poster',
     })
     .option('covers', {
         describe: 'Include release covers',
         type: 'boolean',
         default: true,
+        alias: 'cover',
     })
     .option('photos', {
         describe: 'Include release photos',
@@ -136,11 +143,13 @@ const { argv } = yargs
         describe: 'Include release trailers',
         type: 'boolean',
         default: true,
+        alias: 'trailer',
     })
     .option('teasers', {
         describe: 'Include release teasers',
         type: 'boolean',
         default: true,
+        alias: 'teaser',
     })
     .option('avatars', {
         describe: 'Include actor avatars',
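The renamed and added flags above are what the rest of this commit reads back off argv; yargs also exposes dashed options in camel case, which is why scrape-releases can check argv.withMovies. A minimal, self-contained sketch (option shapes copied from above, invocation values made up):

    const yargs = require('yargs');

    const { argv } = yargs
        .option('movie', { describe: 'Scrape movie info from URL', type: 'array', alias: 'movies' })
        .option('with-movies', { describe: 'Fetch movies for scenes', type: 'boolean', default: true });

    // e.g. `--movie https://example.com/dvds/1 --no-with-movies` yields:
    // argv.movie      => ['https://example.com/dvds/1']
    // argv.withMovies => false (booleans can be negated with --no-<name> by default)
    console.log(argv.movie, argv.withMovies);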
@@ -214,7 +214,6 @@ async function curateReleaseEntry(release, batchId, existingRelease) {
         studio_id: release.studio ? release.studio.id : null,
         shoot_id: release.shootId || null,
         entry_id: release.entryId || null,
-        parent_id: release.parentId,
         type: release.type,
         url: release.url,
         title: release.title,
@@ -327,21 +326,6 @@ function accumulateActors(releases) {
     }, {});
 }
 
-function accumulateMovies(releases) {
-    return releases.reduce((acc, release) => {
-        if (release.movie) {
-            if (acc[release.movie]) {
-                acc[release.movie] = acc[release.movie].concat(release.id);
-                return acc;
-            }
-
-            acc[release.movie] = [release.id];
-        }
-
-        return acc;
-    }, {});
-}
-
 async function storeReleaseAssets(releases) {
     if (!argv.media) {
         return;
@@ -501,7 +485,6 @@ async function storeReleases(releases) {
     logger.info(`Stored ${storedReleases.length} new releases`);
 
     const actors = accumulateActors(storedReleases);
-    const movies = accumulateMovies(storedReleases);
 
     await associateActors(actors, storedReleases);
 
@@ -518,7 +501,6 @@ async function storeReleases(releases) {
     return {
         releases: storedReleases,
         actors,
-        movies,
     };
 }
 
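The accumulator deleted here grouped stored scenes by their movie reference and returned a map of movie key to scene ids; that responsibility now lives in scrape-releases (see the new accumulateMovies below), so storeReleases stops returning a movies map. For reference, the pattern it implemented, written as a standalone sketch (accumulateByKey is an illustrative name, not part of the codebase):

    // Group release ids under whatever key a release carries.
    function accumulateByKey(releases, key) {
        return releases.reduce((acc, release) => {
            if (!release[key]) return acc;
            acc[release[key]] = (acc[release[key]] || []).concat(release.id);
            return acc;
        }, {});
    }

    // accumulateByKey([{ id: 1, movie: 'https://example.com/dvds/1' }], 'movie')
    // => { 'https://example.com/dvds/1': [1] }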
@@ -5,6 +5,8 @@ const Promise = require('bluebird');
 
 const logger = require('./logger')(__filename);
 const argv = require('./argv');
+const include = require('./utils/argv-include')(argv);
+const knex = require('./knex');
 const scrapers = require('./scrapers/scrapers');
 const { findSiteByUrl } = require('./sites');
 const { findNetworkByUrl } = require('./networks');
@@ -33,7 +35,7 @@ async function findSite(url, release) {
     return null;
 }
 
-async function scrapeRelease(source, basicRelease = null, type = 'scene', preflight) {
+async function scrapeRelease(source, basicRelease = null, type = 'scene', beforeFetchLatest) {
     // profile scraper may return either URLs or pre-scraped scenes
     const sourceIsUrlOrEmpty = typeof source === 'string' || source === undefined;
     const url = sourceIsUrlOrEmpty ? source : source?.url;
@@ -72,8 +74,8 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', preflight) {
     }
 
     const scrapedRelease = type === 'scene'
-        ? await scraper.fetchScene(url, site, release, preflight)
-        : await scraper.fetchMovie(url, site, release, preflight);
+        ? await scraper.fetchScene(url, site, release, beforeFetchLatest, include)
+        : await scraper.fetchMovie(url, site, release, beforeFetchLatest, include);
 
     return {
         ...release,
@@ -85,8 +87,42 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', preflight) {
     };
 }
 
-async function scrapeReleases(sources, release = null, type = 'scene', preflight = null) {
-    const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, release, type, preflight), {
+async function accumulateMovies(releases) {
+    if (!argv.withMovies) return [];
+
+    const moviesByUrl = releases.reduce((acc, release) => {
+        if (!release.movie) return acc;
+        const movie = release.movie.url ? release.movie : { url: release.movie };
+
+        if (!acc[movie.url]) {
+            acc[movie.url] = {
+                ...movie,
+                type: 'movie',
+                sceneIds: [],
+            };
+        }
+
+        acc[movie.url].sceneIds = acc[movie.url].sceneIds.concat(release.id);
+
+        return acc;
+    }, {});
+
+    const movies = await Promise.map(Object.values(moviesByUrl), async movie => scrapeRelease(movie, null, 'movie'));
+    const { releases: storedMovies } = await storeReleases(movies);
+
+    const movieAssociations = storedMovies.reduce((acc, movie) => acc.concat(movie.sceneIds.map(sceneId => ({
+        movie_id: movie.id,
+        scene_id: sceneId,
+    }))), []);
+
+    await knex('releases_movies').insert(movieAssociations);
+
+    // console.log(moviesByUrl);
+    return movies;
+}
+
+async function scrapeReleases(sources, type = 'scene') {
+    const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, null, type), {
         concurrency: 5,
     }).filter(Boolean);
 
@@ -97,26 +133,26 @@ async function scrapeReleases(sources, release = null, type = 'scene', preflight = null) {
     }
 
     if (argv.save) {
+        /*
         const movie = scrapedRelease.movie
             ? await scrapeRelease(scrapedRelease.movie, null, false, 'movie')
            : null;
 
         if (movie) {
             const { releases: [storedMovie] } = await storeReleases([movie]);
             curatedRelease.parentId = storedMovie.id;
         }
+        */
 
         const { releases: storedReleases } = await storeReleases(curatedReleases);
+        const movieScenes = storedReleases.map(movie => movie.scenes).flat();
 
+        // console.log(movieScenes);
+        await accumulateMovies(storedReleases);
+
         if (storedReleases) {
             logger.info(storedReleases.map(storedRelease => `\nhttp://${config.web.host}:${config.web.port}/scene/${storedRelease.id}/${storedRelease.slug}`).join(''));
         }
 
         return storedReleases;
     }
 
     return curatedReleases;
 }
 
+async function scrapeScenes(sources) {
+    return scrapeReleases(sources, 'scene');
+}
+
+async function scrapeMovies(sources) {
+    return scrapeReleases(sources, 'movie');
+}
+
 async function deepFetchReleases(baseReleases, beforeFetchLatest) {
@@ -151,13 +187,13 @@ async function deepFetchReleases(baseReleases, beforeFetchLatest) {
         concurrency: 2,
     });
 
     // console.log(deepReleases);
 
     return deepReleases;
 }
 
 module.exports = {
     deepFetchReleases,
+    scrapeMovies,
     scrapeRelease,
     scrapeReleases,
+    scrapeScenes,
 };
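The new accumulateMovies above is the heart of the commit: stored scenes that reference a movie are grouped by movie URL, the movies themselves are scraped and stored as releases of type 'movie', and each stored movie is linked back to its scenes through the releases_movies join table. A condensed sketch of how the association rows are derived, with made-up ids (it assumes, as the code above does, that the stored movie rows still carry the sceneIds collected during grouping):

    const storedMovies = [{ id: 17, sceneIds: [101, 102] }];

    const movieAssociations = storedMovies.reduce((acc, movie) => acc.concat(
        movie.sceneIds.map(sceneId => ({ movie_id: movie.id, scene_id: sceneId })),
    ), []);

    // movieAssociations => [{ movie_id: 17, scene_id: 101 }, { movie_id: 17, scene_id: 102 }]
    // await knex('releases_movies').insert(movieAssociations);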
@@ -4,6 +4,7 @@ const Promise = require('bluebird');
 const moment = require('moment');
 
 const argv = require('./argv');
+const include = require('./utils/argv-include')(argv);
 const logger = require('./logger')(__filename);
 const knex = require('./knex');
 const { fetchIncludedSites } = require('./sites');
@@ -42,7 +43,7 @@ async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases
         return [];
     }
 
-    const latestReleases = await scraper.fetchLatest(site, page, beforeFetchLatest, accSiteReleases);
+    const latestReleases = await scraper.fetchLatest(site, page, beforeFetchLatest, accSiteReleases, include);
 
     if (!Array.isArray(latestReleases)) {
         logger.warn(`Scraper returned ${latestReleases || 'null'} when fetching latest from '${site.name}' on '${site.network.name}'`);
@@ -89,7 +90,7 @@ async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases
 
 async function scrapeUpcomingReleases(scraper, site, beforeFetchLatest) {
     if (argv.upcoming && scraper.fetchUpcoming) {
-        const upcomingReleases = await scraper.fetchUpcoming(site, 1, beforeFetchLatest);
+        const upcomingReleases = await scraper.fetchUpcoming(site, 1, beforeFetchLatest, include);
 
         return upcomingReleases
             ? upcomingReleases.map(release => ({ ...release, site, upcoming: true }))
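Both fetch calls now receive the include object built by utils/argv-include from the media flags in src/argv.js (posters, covers, photos, trailers, teasers). A hedged sketch of how a site scraper could honour it from fetchLatest; the index-fetching helper and URL handling here are purely illustrative, only the include.trailer flag name is taken from the scraper changes below:

    // Illustrative only: drop media the user excluded before returning releases.
    async function fetchLatest(site, page, beforeFetchLatest, accSiteReleases, include) {
        const releases = await fetchReleaseIndex(site, page); // hypothetical helper

        return releases.map((release) => {
            if (!include.trailer) delete release.trailer; // e.g. --no-trailers
            return release;
        });
    }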
@@ -135,10 +135,10 @@ function getEntryId(html) {
 }
 
 function scrapeAll(scenes, site) {
-    return scenes.map(({ qu }) => {
+    return scenes.map(({ el, qu }) => {
         const release = {};
 
-        release.entryId = qu.el.dataset.setid || qu.q('.rating_box')?.dataset.id;
+        release.entryId = el.dataset.setid || qu.q('.rating_box')?.dataset.id;
 
         release.url = qu.url('.update_title, .dvd_info > a, a ~ a');
         release.title = qu.q('.update_title, .dvd_info > a, a ~ a', true);
@@ -160,7 +160,7 @@ function scrapeAll(scenes, site) {
         } : null;
         }).filter(Boolean);
 
-        const teaserScript = qu.content('script');
+        const teaserScript = qu.html('script');
         if (teaserScript) {
             const src = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4);
             if (src) release.teaser = { src };
@@ -220,17 +220,19 @@ function scrapeUpcoming(html, site) {
     });
 }
 
-async function scrapeScene({ qu }, url, site) {
+async function scrapeScene({ html, qu }, url, site, include) {
     const release = { url, site };
 
-    release.entryId = getEntryId(qu.html);
+    release.entryId = getEntryId(html);
     release.title = qu.q('.title_bar_hilite', true);
     release.description = qu.q('.update_description', true);
 
     release.date = qu.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML');
-    release.actors = qu.all('.backgroundcolor_info > .update_models a, .item .update_models a', true);
-
-    const posterPath = qu.html.match(/useimage = "(.*)"/)?.[1];
+    release.actors = qu.all('.backgroundcolor_info > .update_models a, .item .update_models a', true);
+    release.tags = qu.all('.update_tags a', true);
+
+    const posterPath = html.match(/useimage = "(.*)"/)?.[1];
 
     if (posterPath) {
         const poster = /^http/.test(posterPath) ? posterPath : `${site.url}${posterPath}`;
@@ -243,8 +245,8 @@ async function scrapeScene({ qu }, url, site) {
         }
     }
 
-    if (site.slug !== 'manuelferrara') {
-        const trailerLines = qu.html.split('\n').filter(line => /movie\["trailer\w*"\]\[/i.test(line));
+    if (include.trailer && site.slug !== 'manuelferrara') {
+        const trailerLines = html.split('\n').filter(line => /movie\["trailer\w*"\]\[/i.test(line));
 
         if (trailerLines.length) {
             release.trailer = trailerLines.map((trailerLine) => {
@@ -259,8 +261,7 @@ async function scrapeScene({ qu }, url, site) {
         }
     }
 
-    release.photos = await getPhotos(release.entryId, site);
-    release.tags = qu.all('.update_tags a', true);
+    if (include.photos) release.photos = await getPhotos(release.entryId, site);
 
     if (qu.exists('.update_dvds a')) {
         release.movie = {
@@ -275,27 +276,27 @@ async function scrapeScene({ qu }, url, site) {
     return release;
 }
 
-function scrapeMovie({ el, q, qus }, url, site) {
+function scrapeMovie({ el, qu }, url, site) {
     const movie = { url, site };
 
-    movie.entryId = q('.dvd_details_overview .rating_box').dataset.id;
-    movie.title = q('.title_bar span', true);
-    movie.covers = qus('#dvd-cover-flip > a');
-    movie.channel = q('.update_date a', true);
+    movie.entryId = qu.q('.dvd_details_overview .rating_box').dataset.id;
+    movie.title = qu.q('.title_bar span', true);
+    movie.covers = qu.urls('#dvd-cover-flip > a');
+    movie.channel = qu.q('.update_date a', true);
 
     // movie.releases = Array.from(document.querySelectorAll('.cell.dvd_info > a'), el => el.href);
-    const sceneQs = ctxa(el, '.dvd_details');
-    const scenes = scrapeAll(sceneQs, site);
+    const sceneQus = ctxa(el, '.dvd_details');
+    const scenes = scrapeAll(sceneQus, site);
 
     const curatedScenes = scenes
-        .map(scene => ({ ...scene, movie }))
+        ?.map(scene => ({ ...scene, movie }))
         .sort((sceneA, sceneB) => sceneA.date - sceneB.date);
 
-    movie.date = curatedScenes[0].date;
+    movie.date = curatedScenes?.[0].date;
 
     return {
         ...movie,
-        scenes: curatedScenes,
+        ...(curatedScenes && { scenes: curatedScenes }),
     };
 }
@@ -358,10 +359,10 @@ async function fetchUpcoming(site) {
     return res.statusCode;
 }
 
-async function fetchScene(url, site) {
+async function fetchScene(url, site, baseRelease, preflight, include) {
     const res = await get(url);
 
-    return res.ok ? scrapeScene(res.item, url, site) : res.status;
+    return res.ok ? scrapeScene(res.item, url, site, include) : res.status;
 }
 
 async function fetchMovie(url, site) {
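The release.movie object attached at the end of scrapeScene (its contents are cut off by the hunk boundary) is what feeds the new accumulateMovies upstream: scrape-releases groups scenes by release.movie.url, or treats a bare string as the URL. A minimal sketch of the two accepted shapes, with a placeholder URL:

    const release = {};

    // Either shape is enough for the scene to be associated with a movie later on:
    release.movie = 'https://example.com/dvds/some-dvd';          // bare URL string
    release.movie = { url: 'https://example.com/dvds/some-dvd' }; // pre-scraped object (other scraped fields may ride along)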
@@ -70,7 +70,7 @@ function exists(context, selector) {
     return !!q(context, selector);
 }
 
-function content(context, selector) {
+function html(context, selector) {
     const el = q(context, selector, null, true);
 
     return el && el.innerHTML;
@@ -176,8 +176,8 @@ const legacyFuncs = {
     qall: all,
     qd: date,
     qdate: date,
-    qh: content,
-    qhtml: content,
+    qh: html,
+    qhtml: html,
     qi: image,
     qimage: image,
     qimages: images,
@@ -207,8 +207,7 @@ const legacyFuncs = {
 
 const quFuncs = {
     all,
-    body: content,
-    content,
+    html,
     date,
     dur: duration,
     duration,
@@ -217,7 +216,6 @@ const quFuncs = {
     images,
     img: image,
     imgs: images,
-    inner: content,
     length: duration,
     meta,
     poster,
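With the helper renamed, old qu.content call sites change as in the scraper above, while the legacy qh/qhtml shorthands keep working because they now point at html. A before/after call-site sketch (behaviour unchanged):

    // const teaserScript = qu.content('script');  // before
    const teaserScript = qu.html('script');        // after: returns el && el.innerHTML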