Added basic filename copy. Added HTTP helper to q. Fetching all actor release pages from Naughty America. Added various high res network logos.

This commit is contained in:
2020-02-04 00:18:53 +01:00
parent bffa6d2c9e
commit ef602a3a15
42 changed files with 1483 additions and 54 deletions

View File

@@ -437,19 +437,18 @@ async function scrapeBasicActors() {
}
async function associateActors(mappedActors, releases) {
const actorNames = Object.keys(mappedActors);
const actorSlugs = actorNames.map(name => slugify(name));
const actorMap = Object.keys(mappedActors).reduce((acc, actorName) => ({ ...acc, [actorName]: slugify(actorName) }), {});
const [existingActorEntries, existingAssociationEntries] = await Promise.all([
knex('actors')
.whereIn('name', actorNames)
.orWhereIn('slug', actorSlugs),
.whereIn('name', Object.keys(actorMap))
.orWhereIn('slug', Object.values(actorMap)),
knex('releases_actors').whereIn('release_id', releases.map(release => release.id)),
]);
const associations = await Promise.map(Object.entries(mappedActors), async ([actorName, releaseIds]) => {
try {
const actorEntry = existingActorEntries.find(actor => actor.name === actorName)
const actorEntry = existingActorEntries.find(actor => actor.slug === actorMap[actorName])
|| await storeActor({ name: actorName });
return releaseIds
@@ -469,7 +468,7 @@ async function associateActors(mappedActors, releases) {
await knex('releases_actors').insert(associations.filter(association => association).flat());
// basic actor scraping is failure prone, don't together with actor association
// basic actor scraping is failure prone, don't run together with actor association
// await scrapebasicactors(),
}

View File

@@ -6,7 +6,7 @@ const cheerio = require('cheerio');
const moment = require('moment');
const slugify = require('../utils/slugify');
const { ex } = require('../utils/q');
const { ex, get } = require('../utils/q');
function titleExtractor(pathname) {
const components = pathname.split('/')[2].split('-');
@@ -100,7 +100,13 @@ function scrapeScene(html, url, site) {
};
}
function scrapeProfile(html) {
/**
 * Fetch one additional actor release page and collect the scene links on it.
 *
 * @param {string} url - Address of the paginated actor release page.
 * @returns {Promise<Array>} Whatever `qus` yields for the scene link selector,
 *   or an empty array when the page could not be retrieved.
 */
async function fetchActorReleases(url) {
  const page = await get(url);

  // get() resolves to null for any non-200 response; a single failed page
  // should contribute no releases rather than crash the whole profile scrape
  if (!page) {
    return [];
  }

  return page.qus('.contain-block:not(.live-scenes) .scene-item > a:first-child'); // live scenes repeat on all pages
}
async function scrapeProfile(html) {
const { q, qus } = ex(html);
const profile = {};
@@ -109,7 +115,11 @@ function scrapeProfile(html) {
const avatar = q('img.performer-pic', 'src');
if (avatar) profile.avatar = `https:${avatar}`;
profile.releases = qus('.scene-item > a:first-child');
const releases = qus('.scene-item > a:first-child');
const otherPages = qus('.pagination a:not([rel=next]):not([rel=prev])');
const olderReleases = await Promise.all(otherPages.map(async page => fetchActorReleases(page)));
profile.releases = releases.concat(olderReleases.flat());
return profile;
}

View File

@@ -28,7 +28,7 @@ async function fetchPhotos(url) {
return [];
}
function scrapeAll(html) {
function scrapeAll(html, site) {
return exa(html, '.container .video, .container-fluid .video').map(({ q, qa, qd, ql }) => {
const release = {};
@@ -45,8 +45,8 @@ function scrapeAll(html) {
release.date = qd('.i-date', 'MMM DD', /\w+ \d{1,2}$/)
|| qd('.dt-box', 'MMM.DD YYYY');
release.actors = qa('.model, .i-model', true);
release.duration = ql('.i-amount');
release.actors = site.parameters?.actors || qa('.model, .i-model', true);
release.duration = ql('.i-amount, .amount');
const posterEl = q('.item-img img');
@@ -64,20 +64,40 @@ function scrapeAll(html) {
}).filter(Boolean);
}
async function scrapeScene(html, url) {
const { q, qa, qtext, qi, qd, ql, qu, qis, qp, qt } = ex(html, '#videos-page, #content');
async function scrapeScene(html, url, site) {
const { q, qa, qtext, qi, qd, ql, qu, qis, qp } = ex(html, '#videos-page, #content');
const release = {};
[release.entryId] = new URL(url).pathname.split('/').slice(-2);
release.title = q('h2.text-uppercase, h2.title, #breadcrumb-top + h1', true);
release.title = q('h2.text-uppercase, h2.title, #breadcrumb-top + h1', true)
|| q('h1.m-title', true)?.split('»').slice(-1)[0].trim();
release.description = qtext('.p-desc, .desc');
release.actors = qa('.value a[href*=models], .value a[href*=performer], .value a[href*=teen-babes]', true);
if (release.actors.length === 0) {
const actorEl = qa('.stat').find(stat => /Featuring/.test(stat.textContent))
const actorString = qtext(actorEl);
console.log(actorString);
/*
?.split(/, and|,/g)
.map(actor => actor.trim())
|| [];
*/
}
console.log(release.actors);
if (release.actors.length === 0) release.actors = site.parameters?.actors;
release.tags = qa('a[href*=tag]', true);
const dateEl = qa('.value').find(el => /\w+ \d+\w+, \d{4}/.test(el.textContent));
release.date = qd(dateEl, null, 'MMMM Do, YYYY');
release.date = qd(dateEl, null, 'MMMM Do, YYYY')
|| qd('.date', 'MMMM Do, YYYY', /\w+ \d{1,2}\w+, \d{4}/)
|| qd('.info .holder', 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/);
const durationEl = qa('value').find(el => /\d{1,3}:\d{2}/.test(el.textContent));
release.duration = ql(durationEl);
@@ -94,25 +114,23 @@ async function scrapeScene(html, url) {
]);
}
const trailer = qt();
const trailers = qa('a[href*=Trailers]');
if (trailer) {
release.trailer = [
{
// don't rely on trailer always being 720p by default
src: trailer.replace(/\d+p\.mp4/, '720p.mp4'),
quality: 720,
},
{
src: trailer.replace(/\d+p\.mp4/, '360p.mp4'),
quality: 360,
},
];
if (trailers) {
release.trailer = trailers.map((trailer) => {
const src = `https:${trailer.href}`;
const format = trailer.textContent.trim().match(/^\w+/)[0].toLowerCase();
const quality = parseInt(trailer.textContent.trim().match(/\d+([a-zA-Z]+)?$/)[0], 10);
return format === 'mp4' ? { src, quality } : null;
}).filter(Boolean);
}
const stars = q('.rate-box').dataset.score;
if (stars) release.rating = { stars };
console.log(release);
return release;
}

View File

@@ -60,21 +60,24 @@ function destructConfigNetworks(networks = []) {
}
async function findSiteByUrl(url) {
const { hostname } = new URL(url);
const domain = hostname.replace(/www.|tour./, '');
const { origin, pathname } = new URL(url);
// const domain = hostname.replace(/www.|tour./, '');
const dirUrl = `${origin}${pathname.split('/').slice(0, 2).join('/')}`; // allow for sites on URI directory
const sites = await knex('sites')
const site = await knex('sites')
.leftJoin('networks', 'sites.network_id', 'networks.id')
.select(
'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
)
.where('sites.url', 'like', `%${domain}`)
.orWhere('sites.url', 'like', url)
.orWhere('sites.url', url);
.where('sites.url', url)
.orWhere('sites.url', origin)
.orWhere('sites.url', dirUrl)
// .orWhere('sites.url', 'like', `%${domain}`)
.first();
if (sites.length > 0) {
const curatedSite = curateSite(sites[0], true);
if (site) {
const curatedSite = curateSite(site, true);
return curatedSite;
}

View File

@@ -5,7 +5,6 @@ const path = require('path');
const fs = require('fs-extra');
const argv = require('../argv');
const knex = require('../knex');
async function init() {
@@ -26,6 +25,8 @@ async function init() {
return file;
}));
knex.destroy();
}
init();

View File

@@ -2,6 +2,7 @@
const { JSDOM } = require('jsdom');
const moment = require('moment');
const bhttp = require('bhttp');
function prefixProtocol(url, protocol = 'https') {
if (protocol && /^\/\//.test(url)) {
@@ -22,7 +23,7 @@ function q(context, selector, attrArg, trim = true) {
return trim ? value?.trim() : value;
}
return context.querySelector(selector);
return selector ? context.querySelector(selector) : context;
}
function qall(context, selector, attrArg, trim = true) {
@@ -36,7 +37,7 @@ function qall(context, selector, attrArg, trim = true) {
}
function qtext(context, selector, trim = true) {
const el = q(context, selector, false, trim);
const el = q(context, selector, null, trim);
if (!el) return null;
const text = Array.from(el.childNodes)
@@ -147,7 +148,7 @@ const funcs = {
qus: qurls,
};
function ctx(element, window) {
function init(element, window) {
if (!element) return null;
const contextFuncs = Object.entries(funcs) // dynamically attach methods with context
@@ -166,30 +167,58 @@ function ctx(element, window) {
};
}
function ctxa(context, selector, window) {
return Array.from(context.querySelectorAll(selector)).map(element => ctx(element, window));
/**
 * Build a query context for every element under `context` matching `selector`.
 *
 * @param {object} context - Node exposing `querySelectorAll`.
 * @param {string} selector - CSS selector for the elements to wrap.
 * @param {object} window - Window object handed through to `init` for each element.
 * @returns {Array} One `init` result per matched element, in document order.
 */
function initAll(context, selector, window) {
  const matches = context.querySelectorAll(selector);

  return Array.from(matches, node => init(node, window));
}
function ex(html, selector) {
function extract(html, selector) {
const { window } = new JSDOM(html);
if (selector) {
return ctx(window.document.querySelector(selector), window);
return init(window.document.querySelector(selector), window);
}
return ctx(window.document, window);
return init(window.document, window);
}
function exa(html, selector) {
function extractAll(html, selector) {
const { window } = new JSDOM(html);
return ctxa(window.document, selector, window);
return initAll(window.document, selector, window);
}
/**
 * Request a page over HTTP and wrap its body in query helpers.
 *
 * @param {string} url - Address to request.
 * @param {string} [selector] - Selector forwarded to `extract` / `extractAll`.
 * @param {object} [headers] - Request headers passed to bhttp.
 * @param {boolean} [all=false] - When true, use `extractAll` instead of `extract`.
 * @returns {Promise<*>} The extraction result, or null when the response
 *   status code is anything other than 200.
 */
async function get(url, selector, headers, all = false) {
  const response = await bhttp.get(url, { headers });

  if (response.statusCode !== 200) {
    return null;
  }

  const html = response.body.toString();

  if (all) {
    return extractAll(html, selector);
  }

  return extract(html, selector);
}
/**
 * Variant of `get` that always asks for every match of the selector.
 *
 * @param {string} url - Address to request.
 * @param {string} selector - Selector forwarded to `get`.
 * @param {object} [headers] - Request headers forwarded to `get`.
 * @returns {Promise<*>} Whatever `get` resolves to with its `all` flag set.
 */
async function getAll(url, selector, headers) {
  const matches = await get(url, selector, headers, true);

  return matches;
}
module.exports = {
ex,
exa,
ctx,
ctxa,
extract,
extractAll,
init,
initAll,
get,
getAll,
context: init,
contextAll: initAll,
ex: extract,
exa: extractAll,
ctx: init,
ctxa: initAll,
geta: getAll,
...funcs,
};