Refactored Score.
This commit is contained in:
@@ -36,7 +36,7 @@ function curateEntity(entity, includeParameters = false) {
|
||||
id: entity.id,
|
||||
name: entity.name,
|
||||
url: entity.url,
|
||||
origin: new URL(entity.url).origin,
|
||||
origin: entity.url && new URL(entity.url).origin,
|
||||
description: entity.description,
|
||||
slug: entity.slug,
|
||||
type: entity.type,
|
||||
|
||||
@@ -189,7 +189,7 @@ function scrapeProfile({ query }) {
|
||||
const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day)));
|
||||
|
||||
if (profile.age) {
|
||||
birthday.setUTCFullYear(new Date().getFullYear() - profile.age); // indicate birth year is unknown
|
||||
birthday.setUTCFullYear(new Date().getFullYear() - profile.age);
|
||||
} else {
|
||||
birthday.setUTCFullYear(0); // indicate birth year is unknown
|
||||
}
|
||||
|
||||
@@ -1,253 +1,276 @@
|
||||
'use strict';
|
||||
|
||||
const { ex, exa, get } = require('../utils/q');
|
||||
const unprint = require('unprint');
|
||||
|
||||
const slugify = require('../utils/slugify');
|
||||
const http = require('../utils/http');
|
||||
const { heightToCm, lbsToKg } = require('../utils/convert');
|
||||
const { stripQuery } = require('../utils/url');
|
||||
const { convert } = require('../utils/convert');
|
||||
|
||||
function scrapePhotos(html) {
|
||||
const { qis } = ex(html, '#photos-page');
|
||||
const photos = qis('img');
|
||||
const sizeRegex = /_lg|_xl|_tn/;
|
||||
|
||||
return photos.map((photo) => [
|
||||
photo
|
||||
.replace('x_800', 'x_xl')
|
||||
.replace('_tn', ''),
|
||||
photo,
|
||||
]);
|
||||
}
|
||||
|
||||
async function fetchPhotos(url) {
|
||||
const res = await http.get(url);
|
||||
|
||||
if (res.statusCode === 200) {
|
||||
return scrapePhotos(res.body.toString(), url);
|
||||
function resizeSrc(src) {
|
||||
if (!src) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return [];
|
||||
return Array.from(new Set([
|
||||
src.replace(sizeRegex, '_1280'),
|
||||
src.replace(sizeRegex, '_800'),
|
||||
src.replace(sizeRegex, '_xl'),
|
||||
src,
|
||||
]));
|
||||
}
|
||||
|
||||
function scrapeAll(html, site) {
|
||||
return exa(html, '.container .video, .container-fluid .video').map(({ q, qa, qd, ql }) => {
|
||||
function scrapeAll(scenes, channel, parameters) {
|
||||
return scenes.map(({ query }) => {
|
||||
const release = {};
|
||||
const poster = query.img('.item-img img');
|
||||
|
||||
release.title = q('.title, .i-title', true);
|
||||
const url = stripQuery(query.url('a.i-title, .item-img a'));
|
||||
|
||||
const linkEl = q('a');
|
||||
const url = new URL(linkEl.href);
|
||||
release.url = `${url.origin}${url.pathname}`;
|
||||
release.title = query.content('a.i-title, h2.i-title');
|
||||
release.duration = query.duration('.time-ol');
|
||||
|
||||
// this is a photo album, not a scene (used for profiles)
|
||||
if (/photos\//.test(url)) return null;
|
||||
release.date = query.date('.i-date', ['MMM. Do', 'MMM. YYYY'], { match: /(\w+\.? \d{1,2}\w+)|(\w+\.? \d{4})/ });
|
||||
|
||||
[release.entryId] = url.pathname.split('/').slice(-2);
|
||||
if (!release.date) {
|
||||
const date = query.dateAgo('.i-date');
|
||||
|
||||
release.date = qd('.i-date', 'MMM DD', /\w+ \d{1,2}$/)
|
||||
|| qd('.dt-box', 'MMM.DD YYYY');
|
||||
release.actors = site?.parameters?.actors || qa('.model, .i-model', true);
|
||||
release.duration = ql('.i-amount, .amount');
|
||||
|
||||
const posterEl = q('.item-img img');
|
||||
|
||||
if (posterEl) {
|
||||
release.poster = `https:${posterEl.src}`;
|
||||
if (date) {
|
||||
release.date = date.date;
|
||||
release.datePrecision = date.precision === 'week' ? 'month' : date.precision;
|
||||
}
|
||||
}
|
||||
|
||||
if (posterEl?.dataset.gifPreview) {
|
||||
release.teaser = {
|
||||
src: `https:${posterEl.dataset.gifPreview}`,
|
||||
};
|
||||
release.actors = query.content('.i-model').split(',').map((actor) => actor.trim());
|
||||
|
||||
if (url.includes('join.') || url.includes('/join')) {
|
||||
// no link available, attempt to reconstruct from poster URL
|
||||
const entryId = poster?.match(/posting_(\d+)/)?.[1];
|
||||
|
||||
if (entryId) {
|
||||
// we can get deep data from this
|
||||
release.entryId = entryId;
|
||||
release.url = `${channel.origin}${parameters.path}/${slugify(release.actors[0], '-', { lower: false })}/${entryId}/`;
|
||||
} else {
|
||||
// lost cause, make up entryId to register shallow data
|
||||
release.entryId = slugify(release.title);
|
||||
}
|
||||
} else {
|
||||
release.url = url;
|
||||
release.entryId = new URL(release.url).pathname.match(/\/(\d+)\/?$/)[1];
|
||||
}
|
||||
|
||||
if (poster) {
|
||||
const caps = Array.from(new Set(Array.from({ length: 6 }, (_src, index) => {
|
||||
const file = `${String(index + 1).padStart(2, '0')}_lg`;
|
||||
|
||||
return poster.replace(/0\d_lg/, file);
|
||||
}))).map((src) => resizeSrc(src));
|
||||
|
||||
release.poster = Array.from({ length: caps[0].length }).flatMap((_value, index) => caps.map((src) => src[index])); // try all the best sources first
|
||||
|
||||
if (caps.length > 1) {
|
||||
release.caps = caps;
|
||||
}
|
||||
}
|
||||
|
||||
release.photos = query.imgs('.thumbs img'); // cards layout
|
||||
|
||||
release.teaser = [
|
||||
query.video('.preview-clip source[type="video/mp4"]'),
|
||||
query.video('.preview-clip source[type="video/webm"]'),
|
||||
].filter(Boolean);
|
||||
|
||||
return release;
|
||||
}).filter(Boolean);
|
||||
});
|
||||
}
|
||||
|
||||
async function scrapeScene(html, url, site) {
|
||||
const { qu } = ex(html, '#videos-page, #content section');
|
||||
async function fetchLatest(channel, page = 1, { parameters }) {
|
||||
const res = await unprint.get(`${channel.origin}${parameters.path}/?page=${page}`, {
|
||||
interface: 'request', // seemingly less prone to HTTPParserError: Response does not match the HTTP/1.1 protocol (Invalid character in chunk size)
|
||||
selectAll: '.videos .video, .video-wide', // video-wide for cards layout e.g. Big Boobs POV
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeAll(res.context, channel, parameters);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
function scrapeScene({ query }, url) {
|
||||
const release = {};
|
||||
|
||||
[release.entryId] = new URL(url).pathname.split('/').slice(-2);
|
||||
const info = Object.fromEntries(query.all('.stat').map((infoEl) => [
|
||||
slugify(unprint.query.content(infoEl, '.label')),
|
||||
unprint.query.content(infoEl, '.value'),
|
||||
]));
|
||||
|
||||
release.title = qu.q('h2.text-uppercase, h2.title, #breadcrumb-top + h1', true)
|
||||
|| qu.q('h1.m-title', true)?.split(/»|\//).slice(-1)[0].trim();
|
||||
release.description = qu.text('.p-desc, .desc');
|
||||
release.url = stripQuery(url);
|
||||
release.entryId = new URL(url).pathname.match(/\/(\d+)\/?$/)[1];
|
||||
|
||||
release.actors = qu.all('.value a[href*=models], .value a[href*=performer], .value a[href*=teen-babes]', true);
|
||||
release.title = query.content('.p-desc h2, #videos_page-page h1');
|
||||
release.description = query.text('.p-desc, .desc');
|
||||
|
||||
if (release.actors.length === 0) {
|
||||
const actorEl = qu.all('.stat').find((stat) => /Featuring/.test(stat.textContent));
|
||||
const actorString = qu.text(actorEl);
|
||||
release.date = unprint.extractDate(info.date, 'MMMM Do, YYYY', { match: /\w+ \d{1,2}\w+, \d{4}/ });
|
||||
release.duration = unprint.extractDuration(info.duration) || Number(info.duration) * 60 || null;
|
||||
|
||||
release.actors = actorString?.split(/,\band\b|,/g).map((actor) => actor.trim()) || [];
|
||||
}
|
||||
release.actors = query.all('//span[contains(text(), "Featuring")]/following-sibling::span/a').map((actorEl) => ({
|
||||
name: unprint.query.content(actorEl),
|
||||
url: stripQuery(unprint.query.url(actorEl, null)),
|
||||
}));
|
||||
|
||||
if (release.actors.length === 0 && site.parameters?.actors) release.actors = site.parameters.actors;
|
||||
release.tags = query.contents('.p-desc a[href*="tag/"], .desc a[href*="tag/"]');
|
||||
|
||||
release.tags = qu.all('a[href*=tag]', true);
|
||||
const style = query.content('.vp style');
|
||||
const poster = query.img('#videos_page-page .item-img img') || style?.match(/background-image: url\('(http[\w.:/_-]+)'\);/)?.[1];
|
||||
const fallbackPoster = resizeSrc(query.img('meta[itemprop="image"]', { attribute: 'content' })); // usually a different image
|
||||
|
||||
const dateEl = qu.all('.value').find((el) => /\w+ \d+\w+, \d{4}/.test(el.textContent));
|
||||
release.date = qu.date(dateEl, null, 'MMMM Do, YYYY')
|
||||
|| qu.date('.date', 'MMMM Do, YYYY', /\w+ \d{1,2}\w+, \d{4}/)
|
||||
|| qu.date('.info .holder', 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/);
|
||||
const photos = query.all('.gallery .thumb').map((imgEl) => {
|
||||
const link = unprint.query.url(imgEl, 'a');
|
||||
const img = unprint.query.img(imgEl, 'img');
|
||||
const isJoin = !link || link.includes('join.') || link.includes('/join');
|
||||
|
||||
const durationEl = qu.all('value').find((el) => /\d{1,3}:\d{2}/.test(el.textContent));
|
||||
release.duration = qu.dur(durationEl);
|
||||
return Array.from(new Set([
|
||||
...isJoin ? [] : [link],
|
||||
img.replace('_tn', ''),
|
||||
img,
|
||||
]));
|
||||
});
|
||||
|
||||
release.poster = qu.poster('video') || qu.img('.flowplayer img') || html.match(/posterImage: '(.*\.jpg)'/)?.[1] || null; // _800.jpg is larger than _xl.jpg in landscape
|
||||
const photosUrl = qu.url('.stat a[href*=photos]');
|
||||
if (poster) {
|
||||
release.poster = resizeSrc(poster);
|
||||
|
||||
if (photosUrl) {
|
||||
release.photos = await fetchPhotos(photosUrl);
|
||||
if (fallbackPoster?.includes(poster)) {
|
||||
release.photos = [fallbackPoster, ...photos]; // fallback poster isn't usually in photoset, append
|
||||
} else {
|
||||
release.photos = photos;
|
||||
}
|
||||
} else {
|
||||
release.photos = qu.imgs('img[src*=ThumbNails], .p-photos .tn img').map((photo) => [
|
||||
photo.replace('_tn', ''),
|
||||
photo,
|
||||
]);
|
||||
release.poster = fallbackPoster;
|
||||
release.photos = photos;
|
||||
}
|
||||
|
||||
const trailers = qu.all('a[href*=Trailers]');
|
||||
|
||||
if (trailers) {
|
||||
release.trailer = trailers.map((trailer) => {
|
||||
const src = `https:${trailer.href}`;
|
||||
const format = trailer.textContent.trim().match(/^\w+/)[0].toLowerCase();
|
||||
const quality = parseInt(trailer.textContent.trim().match(/\d+([a-zA-Z]+)?$/)[0], 10);
|
||||
|
||||
return format === 'mp4' ? { src, quality } : null;
|
||||
}).filter(Boolean);
|
||||
}
|
||||
|
||||
const stars = qu.q('.rate-box').dataset.score;
|
||||
if (stars) release.rating = { stars };
|
||||
release.trailer = query.all('.vp video source').map((videoEl) => ({
|
||||
src: unprint.query.video(videoEl, null),
|
||||
quality: parseInt(unprint.query.attribute(videoEl, null, 'res'), 10) || null,
|
||||
}));
|
||||
|
||||
return release;
|
||||
}
|
||||
|
||||
function scrapeModels(html, actorName) {
|
||||
const { qa } = ex(html);
|
||||
const model = qa('.model a').find((link) => link.title === actorName);
|
||||
|
||||
return model?.href || null;
|
||||
}
|
||||
|
||||
async function fetchActorReleases(url, accReleases = []) {
|
||||
const res = await get(url);
|
||||
async function fetchScene(url, channel, baseRelease) {
|
||||
const res = await unprint.get(url, {
|
||||
interface: 'request',
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
const releases = accReleases.concat(scrapeAll(res.item.document.body.outerHTML));
|
||||
const nextPage = res.item.qu.url('.next-pg');
|
||||
|
||||
if (nextPage && new URL(nextPage).searchParams.has('page')) { // last page has 'next' button linking to join page
|
||||
return fetchActorReleases(nextPage, releases);
|
||||
}
|
||||
|
||||
return releases;
|
||||
return scrapeScene(res.context, url, channel, baseRelease);
|
||||
}
|
||||
|
||||
return null;
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function scrapeProfile(html, actorUrl, withReleases) {
|
||||
const { q, qa, qi } = ex(html, '#model-page');
|
||||
const profile = { gender: 'female' };
|
||||
function scrapeProfile({ query }, url) {
|
||||
const profile = { url };
|
||||
const { pathname } = new URL(url);
|
||||
|
||||
const bio = qa('.stat').reduce((acc, el) => {
|
||||
const prop = q(el, '.label', true).slice(0, -1);
|
||||
const key = slugify(prop, '_');
|
||||
const value = q(el, '.value', true);
|
||||
const bio = Object.fromEntries(query.all('.m-info .stat').map((bioEl) => [
|
||||
slugify(unprint.query.content(bioEl, '.label'), '_'),
|
||||
unprint.query.content(bioEl, '.value'),
|
||||
]));
|
||||
|
||||
return {
|
||||
...acc,
|
||||
[key]: value,
|
||||
};
|
||||
}, {});
|
||||
|
||||
if (bio.location) profile.residencePlace = bio.location.replace('Czech Repulic', 'Czech Republic'); // see Laura Lion
|
||||
|
||||
if (bio.birthday) {
|
||||
const birthMonth = bio.birthday.match(/^\w+/)[0].toLowerCase();
|
||||
const [birthDay] = bio.birthday.match(/\d+/);
|
||||
|
||||
profile.birthday = [birthMonth, birthDay]; // currently unused, not to be confused with birthdate
|
||||
if (pathname.includes('big-boob-models')) {
|
||||
profile.gender = 'female';
|
||||
}
|
||||
|
||||
if (bio.ethnicity) profile.ethnicity = bio.ethnicity;
|
||||
if (bio.hair_color) profile.hair = bio.hair_color;
|
||||
if (pathname.includes('male-performer')) {
|
||||
profile.gender = 'male';
|
||||
}
|
||||
|
||||
if (bio.height) profile.height = heightToCm(bio.height);
|
||||
if (bio.weight) profile.weight = lbsToKg(bio.weight);
|
||||
profile.avatar = query.img('.item-img a img:not([src*="posting"])');
|
||||
|
||||
if (bio.bra_size) profile.bust = bio.bra_size;
|
||||
if (bio.measurements) [, profile.waist, profile.hip] = bio.measurements.split('-');
|
||||
profile.placeOfResidence = bio.location;
|
||||
profile.ethnicity = bio.ethnicity;
|
||||
|
||||
if (bio.occupation) profile.occupation = bio.occupation;
|
||||
profile.height = convert(bio.height, 'cm');
|
||||
profile.weight = convert(bio.weight, 'lb', 'kg');
|
||||
|
||||
const avatar = qi('img');
|
||||
if (avatar) profile.avatar = avatar;
|
||||
if (bio.bra_size && bio.measurements) {
|
||||
profile.measurements = bio.measurements.replace(/^\d+-/, `${bio.bra_size}-`);
|
||||
} else {
|
||||
profile.measurements = bio.measurements || bio.bra_size;
|
||||
}
|
||||
|
||||
if (withReleases) {
|
||||
const { origin, pathname } = new URL(actorUrl);
|
||||
profile.releases = await fetchActorReleases(`${origin}${pathname}/scenes?page=1`);
|
||||
profile.hairColor = bio.hair_color;
|
||||
|
||||
const birthday = unprint.extractDate(bio.birthday, 'MMMM D', { match: /\w+.?\s+\d{1,2}/ });
|
||||
|
||||
if (birthday) {
|
||||
birthday.setFullYear(0); // indicate birth year is unknown
|
||||
profile.dateOfBirth = birthday;
|
||||
}
|
||||
|
||||
return profile;
|
||||
}
|
||||
|
||||
async function fetchLatest(site, page = 1) {
|
||||
const latestPath = site.parameters?.path || '/big-boob-videos';
|
||||
const url = `${site.url}${latestPath}?page=${page}`;
|
||||
const res = await http.get(url);
|
||||
|
||||
if (res.statusCode === 200) {
|
||||
return scrapeAll(res.body.toString(), site);
|
||||
async function getActorUrl(actor) {
|
||||
if (actor.url) {
|
||||
return actor.url;
|
||||
}
|
||||
|
||||
return res.statusCode;
|
||||
}
|
||||
const searchRes = await unprint.post('https://www.scoreland.com/search-es/', {
|
||||
keywords: actor.name,
|
||||
's_filters[site]': 'all',
|
||||
's_filters[type]': 'models',
|
||||
}, {
|
||||
interface: 'request',
|
||||
form: true,
|
||||
followRedirects: false,
|
||||
});
|
||||
|
||||
async function fetchScene(url, site) {
|
||||
const res = await http.get(url);
|
||||
const res = await unprint.get(searchRes.headers.location, {
|
||||
interface: 'request',
|
||||
cookies: {
|
||||
cisession: searchRes.cookies.cisession,
|
||||
},
|
||||
// followRedirects: false,
|
||||
selectAll: '.li-item.model',
|
||||
});
|
||||
|
||||
if (res.statusCode === 200) {
|
||||
return scrapeScene(res.body.toString(), url, site);
|
||||
if (res.ok) {
|
||||
const actorEl = res.context.find(({ query }) => slugify(query.content('.i-model')) === actor.slug);
|
||||
const url = actorEl?.query.url('.i-model');
|
||||
|
||||
if (url) {
|
||||
// messy nats link pointing to unpredictable sites, all data seems to be available on scoreland
|
||||
const { pathname } = new URL(url);
|
||||
const actorPath = pathname.match(/\/[\w-]+\/\d+\/?$/);
|
||||
|
||||
if (actorPath) {
|
||||
return `https://www.scoreland.com/big-boob-models${actorPath[0]}`;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
async function fetchProfile({ name: actorName }, context, include, page = 1, source = 0) {
|
||||
const letter = actorName.charAt(0).toUpperCase();
|
||||
async function fetchProfile(actor) {
|
||||
const url = await getActorUrl(actor);
|
||||
|
||||
const sources = [
|
||||
`https://www.scoreland.com/big-boob-models/browse/${letter}/?page=${page}`,
|
||||
`https://www.50plusmilfs.com/xxx-milf-models/browse/${letter}/?page=${page}`,
|
||||
];
|
||||
if (url) {
|
||||
const res = await unprint.get(url, {
|
||||
interface: 'request',
|
||||
select: '#model-page',
|
||||
});
|
||||
|
||||
const url = sources[source];
|
||||
|
||||
const res = await http.get(url, {
|
||||
followRedirects: false,
|
||||
});
|
||||
|
||||
if (res.statusCode === 200) {
|
||||
const actorUrl = scrapeModels(res.body.toString(), actorName);
|
||||
|
||||
if (actorUrl) {
|
||||
const actorRes = await http.get(actorUrl);
|
||||
|
||||
if (actorRes.statusCode === 200) {
|
||||
return scrapeProfile(actorRes.body.toString(), actorUrl, include.scenes);
|
||||
}
|
||||
|
||||
return null;
|
||||
if (res.ok) {
|
||||
return scrapeProfile(res.context, url);
|
||||
}
|
||||
|
||||
return fetchProfile({ name: actorName }, context, include, page + 1, source);
|
||||
}
|
||||
|
||||
if (sources[source + 1]) {
|
||||
return fetchProfile({ name: actorName }, context, include, 1, source + 1);
|
||||
return res.status;
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
Reference in New Issue
Block a user