Expanded puppeteer options. Fixed Mike Adriano scraper. Fixed convert utility.
This commit is contained in:
@@ -1,88 +1,166 @@
|
||||
'use strict';
|
||||
|
||||
const qu = require('../utils/qu');
|
||||
const http = require('../utils/http');
|
||||
const unprint = require('unprint');
|
||||
|
||||
function scrapeAll(scenes) {
|
||||
const http = require('../utils/http');
|
||||
const slugify = require('../utils/slugify');
|
||||
const { convert } = require('../utils/convert');
|
||||
|
||||
/**
 * Build release objects from the scene tiles of an overview page.
 *
 * @param {Array<{ query: object }>} scenes - one query context per scene tile
 * @param {{ url: string }} channel - channel whose `url` is passed as origin when resolving scene links
 * @returns {object[]} one release per tile, in the order the tiles were given
 */
function scrapeAll(scenes, channel) {
	return scenes.map(({ query }) => {
		const release = {};

		release.title = query.content('h3.title a, .content-title-wrap a');
		release.url = query.url('h3.title a, h1.title a, .content-title-wrap a', { origin: channel.url });

		// new URL() throws on a missing link; one broken tile must not abort the whole page
		const pathname = release.url ? new URL(release.url).pathname : null;

		release.entryId = pathname?.match(/\/scenes\/([\w-]+)/)?.[1];

		release.description = query.content('.desc, .content-description');
		release.date = query.date('.date, time, .hide', 'Do MMM YYYY', { match: null });

		release.actors = query.contents('h4.models a, .content-models a');
		release.duration = query.duration('//span[contains(@class, "total-time") and text()[contains(., ":")]]'); // total-time is also used for photo counts on True Anal

		const [poster, ...primaryPhotos] = query.imgs('a img');

		// thumbnails without a url(...) background yield no match; drop them instead of storing undefined
		const secondaryPhotos = query
			.styles('.thumb-top, .thumb-bottom, .thumb-mouseover', { styleAttribute: 'background-image' })
			.map((style) => style?.match(/url\((.*)\)/)?.[1])
			.filter(Boolean);

		release.poster = poster;
		release.photos = primaryPhotos.concat(secondaryPhotos);

		return release;
	});
}
|
||||
|
||||
async function scrapeScene({ query }, url) {
|
||||
/**
 * Build a release from a scene page.
 *
 * Values are read from the JSON in the `#__NEXT_DATA__` element
 * (`props.pageProps.content`) when present; title, description, date, actors,
 * duration and entry ID fall back to the page markup or URL otherwise.
 *
 * @param {{ query: object }} context - query context for the scene page
 * @param {string} url - absolute scene URL, used to derive the entry ID when the JSON has no slug
 * @param {{ url: string }} channel - channel whose `url` prefixes actor profile links
 * @returns {Promise<object>} the scraped release
 */
async function scrapeScene({ query }, url, channel) {
	const release = {};

	const pathname = new URL(url).pathname;
	const data = query.json('#__NEXT_DATA__')?.props?.pageProps?.content;

	release.entryId = data?.slug || pathname.match(/\/scenes\/([\w-]+)/)?.[1];

	release.title = data?.title || query.content('.content-page-info .title');
	release.description = data?.description || query.content('.content-page-info .desc');

	release.date = data?.formatted_date
		? unprint.extractDate(data.formatted_date, 'Do MMM YYYY', { match: null })
		: query.date('.content-page-info .date, .content-page-info .hide, .post-date', 'Do MMM YYYY', { match: null });

	release.actors = data?.models_thumbs?.map((actor) => ({
		name: actor.name,
		url: actor.slug && `${channel.url}/models/${actor.slug}`,
		avatar: actor.thumb,
	}))
		// unprint.query is used as a namespace of element-first helpers elsewhere in this file, not as a callable
		|| query.elements('.content-page-info .models a').map((actorEl) => ({
			name: unprint.query.content(actorEl),
			url: unprint.query.url(actorEl, null),
		}));

	release.duration = data?.seconds_duration || query.duration('.content-page-info .total-time:last-child');

	// fallback order: trailer screencap, thumbnail, first extra thumbnail
	release.poster = [data?.trailer_screencap, data?.thumb, data?.extra_thumbnails?.[0]].filter(Boolean);
	release.photos = data?.extra_thumbnails?.slice(1); // first photo is poster

	release.trailer = data?.trailer_url || null;
	release.caps = data?.thumbs;

	release.tags = data?.tags;

	release.qualities = data?.videos && Object.values(data.videos).map((video) => video.height);

	return release;
}
|
||||
|
||||
async function fetchLatest(channel, page = 1) {
|
||||
const { host } = new URL(channel.url);
|
||||
const url = `https://tour.${host}/videos?page=${page}`;
|
||||
/**
 * Build an actor profile from a model page.
 *
 * Bio rows are collected into a map keyed by the slugified text of each row's
 * `span`, with the row's text as value.
 *
 * @param {{ query: object }} context - query context for the model page
 * @returns {Promise<object>} the profile; fields that are absent from the page stay unset
 */
async function scrapeProfile({ query }) {
	const profile = {};

	const bio = Object.fromEntries(query.all('.model-info li, .model-desc li')
		.map((el) => [
			slugify(unprint.query.content(el, 'span')),
			unprint.query.text(el),
		])
		// rows without a label would otherwise end up under a meaningless key
		.filter(([key]) => Boolean(key)));

	const avatar = query.img('.model-photo img, img[alt="model"]');

	if (avatar) {
		// try the URL without its size suffix first; the Set avoids listing the same URL twice when there is no suffix
		profile.avatar = [...new Set([
			avatar.replace(/-\d+x\d+/, ''),
			avatar,
		])];
	}

	if (Object.keys(bio).length > 0) {
		profile.description = bio.bio;

		profile.dateOfBirth = bio.birthdate && unprint.extractDate(bio.birthdate, 'YYYY-MM-DD');
		profile.birthPlace = bio.born;

		profile.measurements = bio.measurements;

		profile.height = convert(bio.height, 'cm');
		profile.weight = convert(bio.weight, 'lb', 'kg');

		profile.eyes = bio.eyes;
		profile.hairColor = bio.hair;
	}

	return profile;
}
|
||||
|
||||
async function fetchUpcoming(channel) {
|
||||
const { host } = new URL(channel.url);
|
||||
const url = `https://tour.${host}`;
|
||||
/**
 * Fetch an overview page, optionally through the browser bypass.
 *
 * @param {string} url - page to fetch
 * @param {{ useBrowser?: boolean }} [parameters] - when `useBrowser` is truthy the page is requested via `http.get` with the shared bypass browser
 * @returns {Promise<object>} for the browser path `{ ok, status, context }` (`context` only on HTTP 200); otherwise whatever `unprint.get` resolves to
 */
async function fetchLatestContent(url, parameters) {
	// parameters may be omitted by the caller; treat that as "no browser"
	if (!parameters?.useBrowser) {
		return unprint.get(url);
	}

	const res = await http.get(url, {
		bypassBrowser: 'shared',
		bypass: {
			// references `document`, so presumably evaluated inside the page; keep it free of outer-scope references
			evaluate: async () => {
				// images lazy loaded by JS, gradually scroll through page
				for (const el of Array.from(document.querySelectorAll('.content-item '))) {
					await new Promise((resolve) => {
						el.scrollIntoView();
						setTimeout(resolve, 20);
					});
				}
			},
		},
	});

	if (res.statusCode !== 200) {
		return {
			ok: false,
			status: res.statusCode,
		};
	}

	return {
		ok: true,
		status: res.statusCode,
		context: unprint.init(res.body),
	};
}
|
||||
|
||||
async function fetchLatest(channel, page = 1, { parameters }) {
|
||||
const url = `${channel.url}/scenes?page=${page}`;
|
||||
const res = await fetchLatestContent(url, parameters);
|
||||
|
||||
if (res.ok) {
|
||||
if (res.item.query.exists('a[href*="stackpath.com"]')) {
|
||||
if (res.context.query.exists('a[href*="stackpath.com"]')) {
|
||||
throw new Error('URL blocked by StackPath');
|
||||
}
|
||||
|
||||
const sceneItem = qu.init(res.item.el, '#upcoming-content');
|
||||
|
||||
if (sceneItem) {
|
||||
return scrapeAll([sceneItem], channel);
|
||||
}
|
||||
|
||||
return null;
|
||||
return scrapeAll(unprint.initAll(res.context.query.all('.content-item-large, .content-item, .content-border')), channel);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
@@ -92,58 +170,40 @@ async function fetchScene(url, channel) {
|
||||
const cookieJar = http.cookieJar();
|
||||
const session = http.session({ cookieJar });
|
||||
|
||||
/* not working
|
||||
const resA = await http.get(url, {
|
||||
session,
|
||||
extract: {
|
||||
runScripts: 'dangerously',
|
||||
},
|
||||
});
|
||||
|
||||
cookieJar.setCookieSync(http.toughCookie.Cookie.parse(resA.document.cookie), url);
|
||||
|
||||
console.log(res.req);
|
||||
*/
|
||||
|
||||
const res = await http.get(url, {
|
||||
session,
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
const item = qu.init(res.document);
|
||||
const context = unprint.init(res.body);
|
||||
|
||||
if (item.query.exists('a[href*="stackpath.com"]')) {
|
||||
if (context.query.exists('a[href*="stackpath.com"]')) {
|
||||
throw new Error('URL blocked by StackPath');
|
||||
}
|
||||
|
||||
return scrapeScene(item, url, channel);
|
||||
return scrapeScene(context, url, channel);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
/**
 * Fetch and scrape an actor's profile page at `<channel url>/models/<actor slug>`.
 *
 * @param {{ slug: string }} actor - actor whose slug forms the profile path
 * @param {{ channel: { url: string } }} context - scrape context providing the channel URL
 * @returns {Promise<object|number>} the scraped profile, or the response status when the request is not ok
 */
async function fetchProfile(actor, context) {
	const session = http.session();

	// NOTE(review): presumably primes the session before the profile request; the unprint request below does not receive this session — confirm it is still needed
	await http.get(context.channel.url, { session });

	const url = `${context.channel.url}/models/${actor.slug}`;
	const res = await unprint.get(url);

	if (res.ok) {
		return scrapeProfile(res.context, context.channel);
	}

	return res.status;
}
|
||||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchUpcoming,
|
||||
// fetchProfile,
|
||||
fetchProfile,
|
||||
fetchScene,
|
||||
};
|
||||
|
||||
@@ -211,6 +211,7 @@ const scrapers = {
|
||||
deeplush: nubiles,
|
||||
devilsfilm: famedigital,
|
||||
digitalplayground: aylo,
|
||||
dirtyauditions: mikeadriano,
|
||||
dorcelclub: dorcel,
|
||||
doubleviewcasting: firstanalquest,
|
||||
dtfsluts: fullpornnetwork,
|
||||
|
||||
Reference in New Issue
Block a user