forked from DebaucheryLibrarian/traxxx
Added basic filename copy. Added HTTP helper to q. Fetching all actor release pages from Naughty America. Added various high res network logos.
This commit is contained in:
@@ -437,19 +437,18 @@ async function scrapeBasicActors() {
|
||||
}
|
||||
|
||||
async function associateActors(mappedActors, releases) {
|
||||
const actorNames = Object.keys(mappedActors);
|
||||
const actorSlugs = actorNames.map(name => slugify(name));
|
||||
const actorMap = Object.keys(mappedActors).reduce((acc, actorName) => ({ ...acc, [actorName]: slugify(actorName) }), {});
|
||||
|
||||
const [existingActorEntries, existingAssociationEntries] = await Promise.all([
|
||||
knex('actors')
|
||||
.whereIn('name', actorNames)
|
||||
.orWhereIn('slug', actorSlugs),
|
||||
.whereIn('name', Object.keys(actorMap))
|
||||
.orWhereIn('slug', Object.values(actorMap)),
|
||||
knex('releases_actors').whereIn('release_id', releases.map(release => release.id)),
|
||||
]);
|
||||
|
||||
const associations = await Promise.map(Object.entries(mappedActors), async ([actorName, releaseIds]) => {
|
||||
try {
|
||||
const actorEntry = existingActorEntries.find(actor => actor.name === actorName)
|
||||
const actorEntry = existingActorEntries.find(actor => actor.slug === actorMap[actorName])
|
||||
|| await storeActor({ name: actorName });
|
||||
|
||||
return releaseIds
|
||||
@@ -469,7 +468,7 @@ async function associateActors(mappedActors, releases) {
|
||||
|
||||
await knex('releases_actors').insert(associations.filter(association => association).flat());
|
||||
|
||||
// basic actor scraping is failure prone, don't together with actor association
|
||||
// basic actor scraping is failure prone, don't run together with actor association
|
||||
// await scrapebasicactors(),
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ const cheerio = require('cheerio');
|
||||
const moment = require('moment');
|
||||
|
||||
const slugify = require('../utils/slugify');
|
||||
const { ex } = require('../utils/q');
|
||||
const { ex, get } = require('../utils/q');
|
||||
|
||||
function titleExtractor(pathname) {
|
||||
const components = pathname.split('/')[2].split('-');
|
||||
@@ -100,7 +100,13 @@ function scrapeScene(html, url, site) {
|
||||
};
|
||||
}
|
||||
|
||||
function scrapeProfile(html) {
|
||||
/**
 * Fetch one further page of an actor's release listing and collect its scene links.
 *
 * @param {string} url - Address of the paginated actor page to load.
 * @returns {Promise<Array>} Whatever `qus` yields for the scene anchors on that page,
 *   or an empty array when the page could not be loaded.
 */
async function fetchActorReleases(url) {
	// get() resolves to null for any non-200 response; destructuring null would throw,
	// and a single failed page would then reject the Promise.all that gathers all pages
	const page = await get(url);

	if (!page) {
		return [];
	}

	const { qus } = page;

	// live scenes repeat on all pages
	return qus('.contain-block:not(.live-scenes) .scene-item > a:first-child');
}
|
||||
|
||||
async function scrapeProfile(html) {
|
||||
const { q, qus } = ex(html);
|
||||
const profile = {};
|
||||
|
||||
@@ -109,7 +115,11 @@ function scrapeProfile(html) {
|
||||
const avatar = q('img.performer-pic', 'src');
|
||||
if (avatar) profile.avatar = `https:${avatar}`;
|
||||
|
||||
profile.releases = qus('.scene-item > a:first-child');
|
||||
const releases = qus('.scene-item > a:first-child');
|
||||
const otherPages = qus('.pagination a:not([rel=next]):not([rel=prev])');
|
||||
const olderReleases = await Promise.all(otherPages.map(async page => fetchActorReleases(page)));
|
||||
|
||||
profile.releases = releases.concat(olderReleases.flat());
|
||||
|
||||
return profile;
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ async function fetchPhotos(url) {
|
||||
return [];
|
||||
}
|
||||
|
||||
function scrapeAll(html) {
|
||||
function scrapeAll(html, site) {
|
||||
return exa(html, '.container .video, .container-fluid .video').map(({ q, qa, qd, ql }) => {
|
||||
const release = {};
|
||||
|
||||
@@ -45,8 +45,8 @@ function scrapeAll(html) {
|
||||
|
||||
release.date = qd('.i-date', 'MMM DD', /\w+ \d{1,2}$/)
|
||||
|| qd('.dt-box', 'MMM.DD YYYY');
|
||||
release.actors = qa('.model, .i-model', true);
|
||||
release.duration = ql('.i-amount');
|
||||
release.actors = site.parameters?.actors || qa('.model, .i-model', true);
|
||||
release.duration = ql('.i-amount, .amount');
|
||||
|
||||
const posterEl = q('.item-img img');
|
||||
|
||||
@@ -64,20 +64,40 @@ function scrapeAll(html) {
|
||||
}).filter(Boolean);
|
||||
}
|
||||
|
||||
async function scrapeScene(html, url) {
|
||||
const { q, qa, qtext, qi, qd, ql, qu, qis, qp, qt } = ex(html, '#videos-page, #content');
|
||||
async function scrapeScene(html, url, site) {
|
||||
const { q, qa, qtext, qi, qd, ql, qu, qis, qp } = ex(html, '#videos-page, #content');
|
||||
const release = {};
|
||||
|
||||
[release.entryId] = new URL(url).pathname.split('/').slice(-2);
|
||||
|
||||
release.title = q('h2.text-uppercase, h2.title, #breadcrumb-top + h1', true);
|
||||
release.title = q('h2.text-uppercase, h2.title, #breadcrumb-top + h1', true)
|
||||
|| q('h1.m-title', true)?.split('»').slice(-1)[0].trim();
|
||||
release.description = qtext('.p-desc, .desc');
|
||||
|
||||
release.actors = qa('.value a[href*=models], .value a[href*=performer], .value a[href*=teen-babes]', true);
|
||||
|
||||
if (release.actors.length === 0) {
|
||||
const actorEl = qa('.stat').find(stat => /Featuring/.test(stat.textContent))
|
||||
const actorString = qtext(actorEl);
|
||||
|
||||
console.log(actorString);
|
||||
/*
|
||||
?.split(/, and|,/g)
|
||||
.map(actor => actor.trim())
|
||||
|| [];
|
||||
*/
|
||||
}
|
||||
|
||||
console.log(release.actors);
|
||||
|
||||
if (release.actors.length === 0) release.actors = site.parameters?.actors;
|
||||
|
||||
release.tags = qa('a[href*=tag]', true);
|
||||
|
||||
const dateEl = qa('.value').find(el => /\w+ \d+\w+, \d{4}/.test(el.textContent));
|
||||
release.date = qd(dateEl, null, 'MMMM Do, YYYY');
|
||||
release.date = qd(dateEl, null, 'MMMM Do, YYYY')
|
||||
|| qd('.date', 'MMMM Do, YYYY', /\w+ \d{1,2}\w+, \d{4}/)
|
||||
|| qd('.info .holder', 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/);
|
||||
|
||||
const durationEl = qa('value').find(el => /\d{1,3}:\d{2}/.test(el.textContent));
|
||||
release.duration = ql(durationEl);
|
||||
@@ -94,25 +114,23 @@ async function scrapeScene(html, url) {
|
||||
]);
|
||||
}
|
||||
|
||||
const trailer = qt();
|
||||
const trailers = qa('a[href*=Trailers]');
|
||||
|
||||
if (trailer) {
|
||||
release.trailer = [
|
||||
{
|
||||
// don't rely on trailer always being 720p by default
|
||||
src: trailer.replace(/\d+p\.mp4/, '720p.mp4'),
|
||||
quality: 720,
|
||||
},
|
||||
{
|
||||
src: trailer.replace(/\d+p\.mp4/, '360p.mp4'),
|
||||
quality: 360,
|
||||
},
|
||||
];
|
||||
if (trailers) {
|
||||
release.trailer = trailers.map((trailer) => {
|
||||
const src = `https:${trailer.href}`;
|
||||
const format = trailer.textContent.trim().match(/^\w+/)[0].toLowerCase();
|
||||
const quality = parseInt(trailer.textContent.trim().match(/\d+([a-zA-Z]+)?$/)[0], 10);
|
||||
|
||||
return format === 'mp4' ? { src, quality } : null;
|
||||
}).filter(Boolean);
|
||||
}
|
||||
|
||||
const stars = q('.rate-box').dataset.score;
|
||||
if (stars) release.rating = { stars };
|
||||
|
||||
console.log(release);
|
||||
|
||||
return release;
|
||||
}
|
||||
|
||||
|
||||
19
src/sites.js
19
src/sites.js
@@ -60,21 +60,24 @@ function destructConfigNetworks(networks = []) {
|
||||
}
|
||||
|
||||
async function findSiteByUrl(url) {
|
||||
const { hostname } = new URL(url);
|
||||
const domain = hostname.replace(/www.|tour./, '');
|
||||
const { origin, pathname } = new URL(url);
|
||||
// const domain = hostname.replace(/www.|tour./, '');
|
||||
const dirUrl = `${origin}${pathname.split('/').slice(0, 2).join('/')}`; // allow for sites on URI directory
|
||||
|
||||
const sites = await knex('sites')
|
||||
const site = await knex('sites')
|
||||
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
||||
.select(
|
||||
'sites.*',
|
||||
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
|
||||
)
|
||||
.where('sites.url', 'like', `%${domain}`)
|
||||
.orWhere('sites.url', 'like', url)
|
||||
.orWhere('sites.url', url);
|
||||
.where('sites.url', url)
|
||||
.orWhere('sites.url', origin)
|
||||
.orWhere('sites.url', dirUrl)
|
||||
// .orWhere('sites.url', 'like', `%${domain}`)
|
||||
.first();
|
||||
|
||||
if (sites.length > 0) {
|
||||
const curatedSite = curateSite(sites[0], true);
|
||||
if (site) {
|
||||
const curatedSite = curateSite(site, true);
|
||||
|
||||
return curatedSite;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ const path = require('path');
|
||||
const fs = require('fs-extra');
|
||||
|
||||
const argv = require('../argv');
|
||||
|
||||
const knex = require('../knex');
|
||||
|
||||
async function init() {
|
||||
@@ -26,6 +25,8 @@ async function init() {
|
||||
|
||||
return file;
|
||||
}));
|
||||
|
||||
knex.destroy();
|
||||
}
|
||||
|
||||
init();
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
const { JSDOM } = require('jsdom');
|
||||
const moment = require('moment');
|
||||
const bhttp = require('bhttp');
|
||||
|
||||
function prefixProtocol(url, protocol = 'https') {
|
||||
if (protocol && /^\/\//.test(url)) {
|
||||
@@ -22,7 +23,7 @@ function q(context, selector, attrArg, trim = true) {
|
||||
return trim ? value?.trim() : value;
|
||||
}
|
||||
|
||||
return context.querySelector(selector);
|
||||
return selector ? context.querySelector(selector) : context;
|
||||
}
|
||||
|
||||
function qall(context, selector, attrArg, trim = true) {
|
||||
@@ -36,7 +37,7 @@ function qall(context, selector, attrArg, trim = true) {
|
||||
}
|
||||
|
||||
function qtext(context, selector, trim = true) {
|
||||
const el = q(context, selector, false, trim);
|
||||
const el = q(context, selector, null, trim);
|
||||
if (!el) return null;
|
||||
|
||||
const text = Array.from(el.childNodes)
|
||||
@@ -147,7 +148,7 @@ const funcs = {
|
||||
qus: qurls,
|
||||
};
|
||||
|
||||
function ctx(element, window) {
|
||||
function init(element, window) {
|
||||
if (!element) return null;
|
||||
|
||||
const contextFuncs = Object.entries(funcs) // dynamically attach methods with context
|
||||
@@ -166,30 +167,58 @@ function ctx(element, window) {
|
||||
};
|
||||
}
|
||||
|
||||
function ctxa(context, selector, window) {
|
||||
return Array.from(context.querySelectorAll(selector)).map(element => ctx(element, window));
|
||||
function initAll(context, selector, window) {
|
||||
return Array.from(context.querySelectorAll(selector))
|
||||
.map(element => init(element, window));
|
||||
}
|
||||
|
||||
function ex(html, selector) {
|
||||
function extract(html, selector) {
|
||||
const { window } = new JSDOM(html);
|
||||
|
||||
if (selector) {
|
||||
return ctx(window.document.querySelector(selector), window);
|
||||
return init(window.document.querySelector(selector), window);
|
||||
}
|
||||
|
||||
return ctx(window.document, window);
|
||||
return init(window.document, window);
|
||||
}
|
||||
|
||||
function exa(html, selector) {
|
||||
function extractAll(html, selector) {
|
||||
const { window } = new JSDOM(html);
|
||||
|
||||
return ctxa(window.document, selector, window);
|
||||
return initAll(window.document, selector, window);
|
||||
}
|
||||
|
||||
/**
 * Request a page with bhttp and hand its body to the extraction helpers.
 *
 * @param {string} url - Address to request.
 * @param {string} [selector] - Selector forwarded to extract() or extractAll().
 * @param {Object} [headers] - Request headers passed through to bhttp.
 * @param {boolean} [all=false] - Use extractAll() instead of extract().
 * @returns {Promise<*>} The extract()/extractAll() result for the response body,
 *   or null when the response status is not 200.
 */
async function get(url, selector, headers, all = false) {
	const response = await bhttp.get(url, { headers });

	// only a plain 200 response is turned into a query context
	if (response.statusCode !== 200) {
		return null;
	}

	const html = response.body.toString();

	if (all) {
		return extractAll(html, selector);
	}

	return extract(html, selector);
}
|
||||
|
||||
/**
 * Variant of get() that always asks for the extractAll() form of the result.
 *
 * @param {string} url - Address to request.
 * @param {string} [selector] - Selector forwarded to get().
 * @param {Object} [headers] - Request headers forwarded to get().
 * @returns {Promise<*>} Whatever get() resolves to with its `all` flag enabled.
 */
async function getAll(url, selector, headers) {
	const all = true;

	return get(url, selector, headers, all);
}
|
||||
|
||||
module.exports = {
|
||||
ex,
|
||||
exa,
|
||||
ctx,
|
||||
ctxa,
|
||||
extract,
|
||||
extractAll,
|
||||
init,
|
||||
initAll,
|
||||
get,
|
||||
getAll,
|
||||
context: init,
|
||||
contextAll: initAll,
|
||||
ex: extract,
|
||||
exa: extractAll,
|
||||
ctx: init,
|
||||
ctxa: initAll,
|
||||
geta: getAll,
|
||||
...funcs,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user