Merge remote-tracking branch 'upstream/master'

Update from Upstream
This commit is contained in:
sampulsar1
2021-02-14 19:51:58 +10:30
93 changed files with 375 additions and 262 deletions

View File

@@ -376,7 +376,7 @@ async function curateProfile(profile, actor) {
curatedProfile.hip = Number(profile.hip) || profile.hip?.match?.(/\d+/)?.[0] || null;
// combined measurement value
const measurements = profile.measurements?.match(/(\d+)(\w+)-(\d+)-(\d+)/);
const measurements = profile.measurements?.match(/(\d+)(\w+)[-x](\d+)[-x](\d+)/); // ExCoGi uses x
if (measurements) {
curatedProfile.bust = Number(measurements[1]);

View File

@@ -435,6 +435,10 @@ async function storeFile(media, options) {
return storeImageFile(media, hashDir, hashSubDir, filename, filedir, filepath, options);
}
if (['posters', 'photos', 'covers'].includes(media.role)) {
throw new Error(`Media for '${media.role}' must be an image, but '${media.meta.mimetype}' was detected`);
}
const [stat] = await Promise.all([
fsPromises.stat(media.file.path),
fsPromises.mkdir(path.join(config.media.path, filedir), { recursive: true }),

299
src/scrapers/elevatedx.js Normal file
View File

@@ -0,0 +1,299 @@
'use strict';
const format = require('template-format');
const qu = require('../utils/q');
const slugify = require('../utils/slugify');
const { convert } = require('../utils/convert');
/**
 * Derive an entry ID for a release from its date combined with either the
 * trailer page slug found in its URL or, failing that, its title.
 *
 * @param {object} release - Release being scraped; reads `date`, `url` and `title`.
 * @returns {string|null} `<date>-<slug>` identifier, or null when the release
 * has no date, or neither a usable trailer URL nor a title.
 */
function deriveEntryId(release) {
	if (!release.date) {
		return null;
	}

	const datePart = slugify(qu.formatDate(release.date, 'YYYY-MM-DD'));

	if (release.url) {
		// a URL that is not a /trailers/<slug>.html page yields no match; fall through to the title instead of throwing
		const slug = new URL(release.url).pathname.match(/\/trailers\/(.*).html/)?.[1];

		if (slug) {
			return `${datePart}-${slugify(slug)}`;
		}
	}

	if (release.title) {
		return `${datePart}-${slugify(release.title)}`;
	}

	return null;
}
/**
 * Build the poster and photo lists for a scene.
 *
 * @param {string|null} posterPath - Poster path scraped from the scene page.
 * @param {object} channel - Entity providing `parameters.media` or `url` as the prefix origin.
 * @param {object} [baseRelease] - Previously scraped release whose poster may be reused.
 * @returns {Array} `[poster, photos]`: poster is a list of fallback sources when
 * the scraped path is usable, otherwise the base release poster or null; photos
 * holds the base release poster only when the scraped poster took its place.
 */
function extractPoster(posterPath, channel, baseRelease) {
	const basePoster = baseRelease?.poster;
	const isUsable = Boolean(posterPath) && !/400.jpg/.test(posterPath);

	if (!isUsable) {
		return [basePoster || null, []];
	}

	const original = qu.prefixUrl(posterPath, channel.parameters?.media || channel.url);
	// upscaled variants follow the original
	const upscaled = ['-2x', '-3x'].map(suffix => original.replace('-1x', suffix));
	const photos = basePoster ? [basePoster] : [];

	return [[original, ...upscaled], photos];
}
/**
 * Collect the 3x, 2x and 1x variants of an image, largest first, prefixed
 * with the site's media origin.
 *
 * @param {Function} q - Query function, called as `q(el, selector, attribute)` when
 * `el` is given and as `q(selector, attribute)` otherwise.
 * @param {string} selector - Selector for the image element.
 * @param {object} site - Entity providing `parameters.media` or `url` as prefix.
 * @param {*} [el] - Optional context element passed through to `q`.
 * @returns {string[]} Prefixed sources for every attribute that had a value.
 */
function getImageWithFallbacks(q, selector, site, el) {
	const prefixed = [];

	for (const attribute of ['src0_3x', 'src0_2x', 'src0_1x']) {
		const src = el ? q(el, selector, attribute) : q(selector, attribute);

		if (src) {
			prefixed.push(`${site.parameters?.media || site.url}${src}`);
		}
	}

	return prefixed;
}
/**
 * Scrape a list of scene tiles in the 'classic' layout.
 *
 * @param {Array<{query: object}>} scenes - Initialized query contexts, one per tile.
 * @param {object} channel - Entity whose `url` prefixes the trailer path.
 * @returns {object[]} One release per tile; `trailer` is only set when the
 * title link's onclick attribute contains a quoted path.
 */
function scrapeAllClassic(scenes, channel) {
	return scenes.map((scene) => {
		const { query } = scene;

		const release = {
			url: query.url('.updateInfo h5 a:not([href*="content/"]):not([href*="#coming"])'),
			entryId: query.url('.updateThumb img', 'alt'),
			title: query.cnt('.updateInfo h5 a'),
			actors: query.cnts('.tour_update_models a'),
			date: query.date('.availdate, .updateInfo p span:nth-child(2)', 'MM/DD/YYYY'),
			poster: query.img('.updateThumb img'),
		};

		// the trailer path is the quoted argument inside the link's onclick handler
		const onclick = query.q('.updateInfo h5 a', 'onclick');
		const trailerPath = onclick?.match(/'(.+)'/)?.[1];

		if (trailerPath) {
			release.trailer = `${channel.url}${trailerPath}`;
		}

		return release;
	});
}
/**
 * Scrape a list of scene tiles in the 'tubular' layout.
 *
 * @param {Array<{query: object}>} scenes - Initialized query contexts, one per tile.
 * @param {object} channel - Entity providing `url`, and optionally
 * `parameters.media` (poster prefix) and `parameters.accFilter`.
 * @param {object[]} [accNetworkReleases] - Releases accumulated so far; only
 * consulted when `parameters.accFilter` is set.
 * @returns {Array<object|null>} One entry per tile, in order; an entry is null
 * when filtering is enabled and the release's entry ID was already accumulated.
 */
function scrapeAllTubular(scenes, channel, accNetworkReleases) {
	// build the lookup once instead of re-mapping the accumulated releases for every scene
	const knownEntryIds = channel.parameters?.accFilter && accNetworkReleases
		? new Set(accNetworkReleases.map(accRelease => accRelease.entryId))
		: null;

	return scenes.map(({ query }) => {
		const release = {};

		release.title = query.q('h4 a', 'title') || query.q('h4 a', true);
		release.url = query.url('h4 a');
		release.date = query.date('.more-info-div', 'MMM D, YYYY');
		release.duration = query.dur('.more-info-div');

		const posterPath = query.q('.img-div img', 'src0_1x') || query.img('img.video_placeholder');

		if (posterPath) {
			const poster = /^http/.test(posterPath) ? posterPath : `${channel.parameters?.media || channel.url}${posterPath}`;

			// largest variant first, scraped source last
			release.poster = [
				poster.replace('-1x', '-3x'),
				poster.replace('-1x', '-2x'),
				poster,
			];
		}

		release.teaser = query.video();
		release.entryId = deriveEntryId(release);

		if (knownEntryIds?.has(release.entryId)) {
			// filter out releases that were already scraped from a categorized site, requires sequential site scraping
			return null;
		}

		return release;
	});
}
/**
 * Scrape a scene page in the 'classic' layout.
 *
 * @param {{query: object, html: string}} context - Query context and raw page HTML.
 * @param {string} url - Scene URL (not read here).
 * @param {object} channel - Entity whose `url` prefixes the trailer path.
 * @returns {object} Release with `title`, `poster`, `entryId` and, when an
 * mp4 source is found in the HTML, `trailer`.
 */
function scrapeSceneClassic({ query, html }, url, channel) {
	const release = {};

	release.title = query.q('.updatesBlock h2', true);
	release.poster = query.meta('property="og:image"');

	// the ID is taken from the poster path; without an og:image value there is nothing to match, so don't throw
	release.entryId = release.poster?.match(/\/content\/(.*)\//)?.[1];

	const trailer = html.match(/src="(.+\.mp4)"/)?.[1];

	if (trailer) {
		release.trailer = {
			src: `${channel.url}${trailer}`,
		};
	}

	return release;
}
/**
 * Scrape a scene page in the 'tubular' layout.
 *
 * @param {{query: object, html: string}} context - Query context and raw page HTML.
 * @param {object} entity - Channel or network; reads `url`, `parameters.media`,
 * `type` and, for networks, `children`.
 * @param {string} url - Scene URL, used as the trailer referer.
 * @param {object} [baseRelease] - Previously scraped release, passed to extractPoster.
 * @returns {object} Scraped release; `stars`, `trailer` and `channel` are only
 * set when the page provides them.
 */
function scrapeSceneTubular({ query, html }, entity, url, baseRelease) {
	const release = {};

	release.title = query.q('.trailer-section-head .section-title, .title-block .section-title', true);
	release.description = query.text('.row .update-info-block');

	release.date = query.date('.update-info-row', 'MMM D, YYYY', /\w+ \d{1,2}, \d{4}/);
	release.duration = query.dur('.update-info-row:nth-child(2)');

	release.actors = query.all('.models-list-thumbs a').map(el => ({
		name: query.cnt(el, 'span'),
		avatar: getImageWithFallbacks(query.q, 'img', entity, el),
		url: query.url(el, null),
	}));

	release.tags = query.all('.tags a', true);

	const posterPath = query.q('.player-thumb img', 'src0_1x');
	const trailer = html.match(/<video.*src="(.*\.mp4)/)?.[1];

	[release.poster, release.photos] = extractPoster(posterPath, entity, baseRelease);

	if (trailer) {
		release.trailer = { src: qu.prefixUrl(trailer, entity.parameters?.media || entity.url), referer: url };
	}

	// guard the lookup so a page without rating text does not abort the whole scrape
	const stars = query.q('.update-rating', true)?.match(/\d.\d/)?.[0];

	if (stars) {
		release.stars = Number(stars);
	}

	if (entity.type === 'network') {
		const channelPatterns = (entity.children || [])
			.map(channel => channel.parameters?.match || channel.name)
			.filter(Boolean);

		// an empty alternation would compile to a pattern that matches every tag
		if (channelPatterns.length > 0) {
			const channelRegExp = new RegExp(channelPatterns.join('|'), 'i');
			const channel = release.tags.find(tag => channelRegExp.test(tag));

			if (channel) {
				release.channel = slugify(channel, '');
			}
		}
	}

	release.entryId = deriveEntryId(release);

	return release;
}
/**
 * Scrape a model profile page, including the scenes listed on it when the
 * layout is known.
 *
 * @param {{query: object}} context - Query context for the profile page.
 * @param {object} entity - Entity whose `url` prefixes avatar paths.
 * @param {object} [parameters] - Scraper parameters; `layout` selects the scene scraper.
 * @returns {Promise<object>} Profile with age, dateOfBirth, height,
 * measurements, aliases and, when found, avatar and scenes.
 */
async function scrapeProfile({ query }, entity, parameters) {
	const profile = {};

	const bio = query.cnt('.model_bio, .detail-div');
	const avatarEl = query.q('.model_bio_pic img, .model_bio_thumb');

	profile.age = Number(bio?.match(/Age:\s*(\d{2})/)?.[1]) || null;
	// hand over the captured date only, not the whole match including the 'Age:' label
	profile.dateOfBirth = qu.parseDate(bio?.match(/Age:\s*(\w+ \d{1,2}, \d{4})/)?.[1], 'MMMM D, YYYY');

	profile.height = convert(bio?.match(/\d+\s*(feet|')\s*\d+\s*(inches|"|$)/)?.[0], 'cm');
	profile.measurements = bio?.match(/\w+[-x]\d+[-x]\d+/)?.[0] || null;

	// capture the whole comma-separated list in one group; a repeated capture group only retains its last iteration
	const aliasList = bio?.match(/also known as:\s*((?:[\w\s]+(?:,\s*)?)+)/i)?.[1];

	profile.aliases = aliasList
		?.split(',')
		.map(alias => alias.trim())
		.filter(Boolean) || [];

	if (avatarEl) {
		const avatarSources = [
			avatarEl.getAttribute('src0_3x'),
			avatarEl.getAttribute('src0_2x'),
			avatarEl.getAttribute('src0_1x'),
			avatarEl.getAttribute('src0'),
			avatarEl.getAttribute('src'),
		]
			.filter(avatar => avatar && !/p\d+.jpe?g/.test(avatar)) // remove non-existing attributes and placeholder images
			.map(avatar => qu.prefixUrl(avatar, entity.url));

		if (avatarSources.length) {
			profile.avatar = avatarSources;
		}
	}

	if (parameters?.layout === 'classic') {
		profile.scenes = scrapeAllClassic(qu.initAll(query.all('.bodyArea .updateItem')), entity);
	}

	if (parameters?.layout === 'tubular') {
		profile.scenes = scrapeAllTubular(qu.initAll(query.all('.modelfeature, .item-video')), entity);
	}

	return profile;
}
/**
 * Fetch one page of latest releases and hand the tiles to a layout scraper.
 *
 * @param {object} site - Entity; `parameters.latest` is an optional URL template
 * taking `page`, otherwise the default movies category under `url` is used.
 * @param {number} [page=1] - Page number.
 * @param {object} options - Not read here.
 * @param {object} [preData] - `uniqueReleases` is forwarded to the scraper.
 * @param {Function} allScraper - Called as `allScraper(items, site, uniqueReleases)`.
 * @returns {Promise<*>} The scraper's result, or the response status when the request failed.
 */
async function fetchLatest(site, page = 1, options, preData, allScraper) {
	const template = site.parameters?.latest;
	const customUrl = template && format(template, { page });
	const latestUrl = customUrl || `${site.url}/categories/movies_${page}_d.html`;

	const response = await qu.getAll(latestUrl, '.modelfeature, .item-video, .bodyArea .updateItem');

	return response.ok
		? allScraper(response.items, site, preData?.uniqueReleases)
		: response.status;
}
/**
 * Fetch the upcoming scenes carousel from a 'classic' layout front page.
 *
 * @param {object} channel - Entity whose `url` is requested.
 * @returns {Promise<object[]|*>} Scraped releases, or the response status when the request failed.
 */
async function fetchUpcomingClassic(channel) {
	const response = await qu.getAll(channel.url, '#owl-upcomingScenes .updateItem');

	if (!response.ok) {
		return response.status;
	}

	return scrapeAllClassic(response.items, channel);
}
/**
 * Fetch latest releases for a 'classic' layout site by delegating to
 * fetchLatest with scrapeAllClassic as the tile scraper.
 *
 * @param {object} channel - Entity to fetch from.
 * @param {number} page - Page number, forwarded as-is.
 * @param {object} options - Forwarded as-is.
 * @param {object} preData - Forwarded as-is.
 * @returns {Promise<*>} Whatever fetchLatest resolves to.
 */
async function fetchLatestClassic(channel, page, options, preData) {
return fetchLatest(channel, page, options, preData, scrapeAllClassic);
}
/**
 * Fetch latest releases for a 'tubular' layout site by delegating to
 * fetchLatest with scrapeAllTubular as the tile scraper.
 *
 * @param {object} channel - Entity to fetch from.
 * @param {number} page - Page number, forwarded as-is.
 * @param {object} options - Forwarded as-is.
 * @param {object} preData - Forwarded as-is.
 * @returns {Promise<*>} Whatever fetchLatest resolves to.
 */
async function fetchLatestTubular(channel, page, options, preData) {
return fetchLatest(channel, page, options, preData, scrapeAllTubular);
}
/**
 * Fetch a model profile, trying the known profile URL first and then URLs
 * derived from the actor name slugified without and with dashes.
 *
 * @param {{name: string, url: string}} actor - Actor name and optional known profile URL.
 * @param {{entity: object, parameters: object}} context - Entity providing `url`
 * and optionally a `parameters.profile` URL template taking `actorSlug`;
 * `parameters` is forwarded to scrapeProfile.
 * @returns {Promise<object|null>} The first profile that could be fetched, or null.
 */
async function fetchProfile({ name: actorName, url }, { entity, parameters }) {
	if (!url && !parameters?.profile && !entity.url) {
		return null;
	}

	const profileTemplate = entity.parameters?.profile;

	// without a template or a base URL the derived candidates would start with 'undefined'
	const derivedUrls = profileTemplate || entity.url
		? [slugify(actorName, ''), slugify(actorName, '-')].map(actorSlug => (profileTemplate
			? format(profileTemplate, { actorSlug })
			: `${entity.url}/models/${actorSlug}.html`))
		: [];

	const urls = [...new Set([url, ...derivedUrls])].filter(Boolean);

	// candidates are tried one at a time so later URLs are only requested when earlier ones fail
	for (const profileUrl of urls) {
		const res = await qu.get(profileUrl); // eslint-disable-line no-await-in-loop

		// same success check as the other fetchers in this file, which read res.ok / res.status
		if (res.ok) {
			return scrapeProfile(res.item, entity, parameters);
		}
	}

	return null;
}
// Scraper entry points grouped by site layout; both layouts share fetchProfile.
module.exports = {
// 'classic' layout, the only one exposing fetchUpcoming
classic: {
fetchLatest: fetchLatestClassic,
fetchUpcoming: fetchUpcomingClassic,
fetchProfile,
scrapeAll: scrapeAllClassic,
scrapeScene: scrapeSceneClassic,
},
// 'tubular' layout
tubular: {
fetchLatest: fetchLatestTubular,
fetchProfile,
scrapeAll: scrapeAllTubular,
scrapeScene: scrapeSceneTubular,
},
};

View File

@@ -1,206 +0,0 @@
'use strict';
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
const { feetInchesToCm } = require('../utils/convert');
/**
 * Scrape a list of blog-style video entries.
 *
 * @param {Array<{query: object}>} scenes - Initialized query contexts, one per entry.
 * @param {object} channel - Entity whose `url` is used as origin for links and images.
 * @returns {object[]} One release per entry; `actors` is only set when the
 * title looks like a performer name that also appears in the description.
 */
function scrapeLatestBlog(scenes, channel) {
	return scenes.map(({ query }) => {
		const release = {};

		release.url = query.url('a.more:not([href*="/join.php"])', 'href', { origin: channel.url });

		// prefer the numeric ID pair from the scene URL; fall back to the thumbnail path when the URL is missing or has no such pair
		const urlIds = release.url ? new URL(release.url).pathname.match(/\/scene\/(\d+)\/(\d+)/) : null;

		release.entryId = urlIds
			? urlIds.slice(1, 3).join('-')
			: query.img('.bigthumb')?.match(/\/scenes\/(\w+)/)?.[1];

		// a heading without the ' - ' separator falls through to the plain title instead of throwing
		release.title = query.q('h5 strong', true)?.match(/. - (.+)$/)?.[1] || query.text('.videos h3');
		release.description = query.text('p');

		release.date = query.date('h5 strong, .videos h3', 'MMM. DD, YYYY', /\w+. \d{2}, \d{4}/);

		// remove common patterns so only the name is left
		const curatedTitle = release.title?.replace(/\b(part \d|\banal|bts)\b/gi, '').trim();

		// plain substring check: the title is page text, not a pattern, and may contain regex metacharacters
		if (curatedTitle && !/\band\b/.test(curatedTitle) && release.description?.includes(curatedTitle)) {
			// scene title is probably the actor name
			release.actors = [release.title];
		}

		release.poster = query.img('.bigthumb', null, { origin: channel.url });
		release.photos = query.imgs('.smallthumb', null, { origin: channel.url });

		release.tags = query.all('a[href*="/keywords"]', true);

		return release;
	});
}
/**
 * Scrape a list of update tiles.
 *
 * @param {Array<{query: object}>} scenes - Initialized query contexts, one per tile.
 * @param {object} channel - Entity whose `url` prefixes the trailer path.
 * @returns {object[]} One release per tile; `trailer` is only set when the
 * title link's onclick attribute contains a quoted path.
 */
function scrapeAll(scenes, channel) {
	return scenes.map((scene) => {
		const { query } = scene;

		const release = {
			url: query.url('.updateInfo h5 a:not([href*="content/"]):not([href*="#coming"])'),
			entryId: query.url('.updateThumb img', 'alt'),
			title: query.q('.updateInfo h5 a', true),
			actors: query.all('.tour_update_models a', true),
			date: query.date('.availdate, .updateInfo p span:nth-child(2)', 'MM/DD/YYYY'),
			poster: query.img('.updateThumb img'),
		};

		// the trailer path is the quoted argument inside the link's onclick handler
		const onclick = query.q('.updateInfo h5 a', 'onclick');
		const trailerPath = onclick?.match(/'(.+)'/)?.[1];

		if (trailerPath) {
			release.trailer = { src: `${channel.url}${trailerPath}` };
		}

		return release;
	});
}
/**
 * Scrape a blog-style scene page.
 *
 * @param {{query: object}} context - Query context for the scene page.
 * @param {string} url - Scene URL; the entry ID is the pair of numeric path
 * segments following `/scene/`.
 * @param {object} channel - Entity whose `url` is used as origin for images.
 * @returns {object} Release with entryId (null when the URL has no ID pair),
 * title, description, actors, tags, poster and photos.
 */
function scrapeSceneBlog({ query }, url, channel) {
	const release = {};

	// a URL without the /scene/<id>/<id> pair leaves the ID empty rather than throwing on a failed match
	release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)\/(\d+)/)?.slice(1, 3).join('-') || null;

	release.title = query.text('h4 strong, .videos h3');
	release.description = query.q('#about p, .videos p', true);

	const actors = query.urls('a[href*="/girl/"]')
		.map(actorUrl => actorUrl.match(/video-([\w\s]+)/)?.[1])
		.filter(Boolean);

	// when no performer links are found, the title is used as the performer name
	release.actors = actors.length > 0 ? actors : [release.title];

	release.tags = query.all('.info a[href*="/keywords"], .buttons a[href*="/keywords"]', true);

	release.poster = query.img('#info .main-preview, .bigthumb', null, { origin: channel.url });
	release.photos = [query.img('.previewmed img', null, { origin: channel.url })]
		.concat(query.imgs('.hd-clip img, .smallthumb', null, { origin: channel.url }))
		.filter(Boolean);

	return release;
}
/**
 * Scrape a regular (non-blog) scene page.
 *
 * @param {{query: object, html: string}} context - Query context and raw page HTML.
 * @param {string} url - Scene URL (not read here).
 * @param {object} channel - Entity whose `url` prefixes the trailer path.
 * @returns {object} Release with `title`, `poster`, `entryId` and, when an
 * mp4 source is found in the HTML, `trailer`.
 */
function scrapeScene({ query, html }, url, channel) {
	const release = {};

	release.title = query.q('.updatesBlock h2', true);
	release.poster = query.meta('property="og:image"');

	// the ID is taken from the poster path; without an og:image value there is nothing to match, so don't throw
	release.entryId = release.poster?.match(/\/content\/(.*)\//)?.[1];

	const trailer = html.match(/src="(.+\.mp4)"/)?.[1];

	if (trailer) {
		release.trailer = {
			src: `${channel.url}${trailer}`,
		};
	}

	return release;
}
/**
 * Scrape a model profile page whose bio is a list of `Key: value` paragraphs.
 *
 * @param {{query: object}} context - Query context for the profile page.
 * @param {object} entity - Entity whose `url` is used as origin for the avatar.
 * @returns {object} Profile with age (null when absent or non-numeric), height,
 * eyes, avatar and, when the figure is in `<bust><cup>-<waist>-<hip>` form,
 * bust, cup, waist and hip.
 */
function scrapeProfile({ query }, entity) {
	const profile = {};

	const bio = query.cnts('.info p').reduce((acc, info) => {
		const pair = info.match(/(\w+):\s*(.*)/);

		// paragraphs without a 'key: value' shape carry no bio field; skip them instead of throwing
		if (!pair) {
			return acc;
		}

		const [key, value] = pair.slice(1);

		return { ...acc, [slugify(key, '_')]: value };
	}, {});

	profile.age = Number(bio.age) || null;
	profile.height = feetInchesToCm(bio.height);
	profile.eyes = bio.eyes || bio.eyecolor;

	// free-text figures do not match the numeric pattern and are left out
	const measurements = (bio.figure || bio.measurements)?.match(/(\d+)(\w+)-(\d+)-(\d+)/);

	if (measurements) {
		const [bust, cup, waist, hip] = measurements.slice(1);

		profile.bust = Number(bust);
		profile.cup = cup;
		profile.waist = Number(waist);
		profile.hip = Number(hip);
	}

	profile.avatar = query.img('img.main-preview', 'src', { origin: entity.url });

	return profile;
}
/**
 * Fetch one page of blog-style video entries; pages advance in steps of ten.
 *
 * @param {object} channel - Entity whose `url` is the base of the listing URL.
 * @param {number} page - 1-based page number.
 * @returns {Promise<object[]|*>} Scraped releases, or the response status when the request failed.
 */
async function fetchLatestBlog(channel, page) {
	const offset = (page - 1) * 10;
	const response = await qu.getAll(`${channel.url}/free/updates/videos/${offset}`, '.videos');

	if (response.ok) {
		return scrapeLatestBlog(response.items, channel);
	}

	return response.status;
}
/**
 * Fetch one page of latest releases, using the blog listing for channels
 * flagged with `parameters.blog` and the movies category otherwise.
 *
 * @param {object} channel - Entity to fetch from.
 * @param {number} [page=1] - Page number.
 * @returns {Promise<object[]|*>} Scraped releases, or the response status when the request failed.
 */
async function fetchLatest(channel, page = 1) {
	const isBlog = channel.parameters?.blog;

	if (isBlog) {
		return fetchLatestBlog(channel, page);
	}

	const response = await qu.getAll(`${channel.url}/categories/Movies_${page}_d.html`, '.bodyArea .updateItem');

	if (!response.ok) {
		return response.status;
	}

	return scrapeAll(response.items, channel);
}
/**
 * Fetch upcoming scenes from the front page carousel; blog channels always
 * resolve to an empty list without making a request.
 *
 * @param {object} channel - Entity to fetch from.
 * @returns {Promise<object[]|*>} Scraped releases, or the response status when the request failed.
 */
async function fetchUpcoming(channel) {
	const isBlog = channel.parameters?.blog;

	if (isBlog) {
		return [];
	}

	const response = await qu.getAll(channel.url, '#owl-upcomingScenes .updateItem');

	if (!response.ok) {
		return response.status;
	}

	return scrapeAll(response.items, channel);
}
/**
 * Fetch a scene page and scrape it with the scraper matching the channel type.
 *
 * @param {string} url - Scene URL.
 * @param {object} channel - Entity; `parameters.blog` selects the blog scraper.
 * @returns {Promise<object|*>} Scraped release, or the response status when the request failed.
 */
async function fetchScene(url, channel) {
	const response = await qu.get(url);

	if (!response.ok) {
		return response.status;
	}

	const scrape = channel.parameters?.blog ? scrapeSceneBlog : scrapeScene;

	return scrape(response.item, url, channel);
}
/**
 * Look up a model in the alphabetical listing for the first letter of their
 * name and scrape the profile page of every exact name match.
 *
 * @param {{name: string}} baseActor - Actor to look up.
 * @param {object} entity - Entity whose `url` is the base for the listing and profile links.
 * @returns {Promise<Array|*>} One entry per matching model (a scraped profile,
 * or the response status when that profile request failed), or the listing
 * response status when the listing request failed.
 */
async function fetchProfile(baseActor, entity) {
	const modelsRes = await qu.getAll(`${entity.url}/free/girls.php?alpha=${baseActor.name.slice(0, 1)}`, '.model');

	if (!modelsRes.ok) {
		return modelsRes.status;
	}

	const models = modelsRes.items.filter(({ query }) => query.cnt('strong') === baseActor.name);

	return Promise.all(models.map(async (model) => {
		const modelUrl = model.query.url('a', 'href', { origin: entity.url });
		const modelRes = await qu.get(modelUrl);

		if (modelRes.ok) {
			return scrapeProfile(modelRes.item, entity);
		}

		return modelRes.status;
	}));
}
// Scraper entry points exposed by this module.
module.exports = {
fetchLatest,
fetchScene,
fetchUpcoming,
fetchProfile,
};

View File

@@ -368,4 +368,5 @@ module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
scrapeAllT1,
};

View File

@@ -18,7 +18,7 @@ const dorcel = require('./dorcel');
const elegantangel = require('./elegantangel');
const famedigital = require('./famedigital');
const firstanalquest = require('./firstanalquest');
const fcuk = require('./fcuk');
const elevatedx = require('./elevatedx');
const fullpornnetwork = require('./fullpornnetwork');
const gamma = require('./gamma');
const hitzefrei = require('./hitzefrei');
@@ -88,7 +88,7 @@ const scrapers = {
dorcel,
elegantangel,
famedigital,
fcuk,
exploitedx: elevatedx,
firstanalquest,
forbondage: porndoe,
fullpornnetwork,
@@ -117,6 +117,7 @@ const scrapers = {
mikeadriano,
mindgeek,
naughtyamerica,
nebraskacoeds: elevatedx,
newsensations,
nubiles,
pascalssubsluts,
@@ -158,6 +159,7 @@ const scrapers = {
babes: mindgeek,
babepedia,
babevr: badoink,
backroomcastingcouch: elevatedx,
baddaddypov: fullpornnetwork,
badoinkvr: badoink,
bamvisions,
@@ -165,6 +167,7 @@ const scrapers = {
bangbros,
blacked: vixen,
blackedraw: vixen,
blackambush: elevatedx,
blowpass,
boobpedia,
brattysis: nubiles,
@@ -181,7 +184,7 @@ const scrapers = {
dtfsluts: fullpornnetwork,
elegantangel,
evilangel: gamma,
exploitedcollegegirls: fcuk,
exploitedcollegegirls: elevatedx,
eyeontheguy: hush,
fakehub: mindgeek,
firstanalquest,
@@ -220,6 +223,7 @@ const scrapers = {
mofos: mindgeek,
mugfucked: fullpornnetwork,
naughtyamerica,
nebraskacoeds: elevatedx,
nfbusty: nubiles,
nubilefilms: nubiles,
nubiles,