Scraping actor scenes from Hussie Pass. Adding entity to actor base releases.
This commit is contained in:
parent
b952b758d7
commit
d14ef90136
|
@ -3070,7 +3070,7 @@ const sites = [
|
||||||
{
|
{
|
||||||
slug: 'interracialpovs',
|
slug: 'interracialpovs',
|
||||||
name: 'Interracial POVs',
|
name: 'Interracial POVs',
|
||||||
url: 'https://www.interracialpovs.com',
|
url: 'https://interracialpovs.com',
|
||||||
tags: ['interracial', 'pov'],
|
tags: ['interracial', 'pov'],
|
||||||
parent: 'hussiepass',
|
parent: 'hussiepass',
|
||||||
},
|
},
|
||||||
|
|
|
@ -396,7 +396,7 @@ async function curateProfile(profile) {
|
||||||
}).filter(Boolean)
|
}).filter(Boolean)
|
||||||
: [];
|
: [];
|
||||||
|
|
||||||
curatedProfile.releases = toBaseReleases(profile.releases);
|
curatedProfile.releases = toBaseReleases(profile.releases, profile.entity);
|
||||||
|
|
||||||
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`);
|
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`);
|
||||||
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`);
|
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`);
|
||||||
|
|
|
@ -46,7 +46,7 @@ async function findEntities(baseReleases) {
|
||||||
return entitiesBySlug;
|
return entitiesBySlug;
|
||||||
}
|
}
|
||||||
|
|
||||||
function toBaseReleases(baseReleasesOrUrls) {
|
function toBaseReleases(baseReleasesOrUrls, entity = null) {
|
||||||
if (!baseReleasesOrUrls) {
|
if (!baseReleasesOrUrls) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
@ -57,6 +57,7 @@ function toBaseReleases(baseReleasesOrUrls) {
|
||||||
// base release with URL
|
// base release with URL
|
||||||
return {
|
return {
|
||||||
...baseReleaseOrUrl,
|
...baseReleaseOrUrl,
|
||||||
|
entity,
|
||||||
deep: false,
|
deep: false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -65,6 +66,7 @@ function toBaseReleases(baseReleasesOrUrls) {
|
||||||
// URL
|
// URL
|
||||||
return {
|
return {
|
||||||
url: baseReleaseOrUrl,
|
url: baseReleaseOrUrl,
|
||||||
|
entity,
|
||||||
deep: false,
|
deep: false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -73,6 +75,7 @@ function toBaseReleases(baseReleasesOrUrls) {
|
||||||
// base release without URL, prepare for passthrough
|
// base release without URL, prepare for passthrough
|
||||||
return {
|
return {
|
||||||
...baseReleaseOrUrl,
|
...baseReleaseOrUrl,
|
||||||
|
entity,
|
||||||
deep: false,
|
deep: false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
const util = require('util');
|
const util = require('util');
|
||||||
|
|
||||||
const { get, getAll, ed, formatDate, prefixUrl, ctxa } = require('../utils/q');
|
const qu = require('../utils/q');
|
||||||
const slugify = require('../utils/slugify');
|
const slugify = require('../utils/slugify');
|
||||||
const { feetInchesToCm, inchesToCm } = require('../utils/convert');
|
const { feetInchesToCm, inchesToCm } = require('../utils/convert');
|
||||||
|
|
||||||
|
@ -10,11 +10,11 @@ function deriveEntryId(release) {
|
||||||
if (release.date && release.url) {
|
if (release.date && release.url) {
|
||||||
const slug = new URL(release.url).pathname.match(/\/trailers\/(.*).html/)[1];
|
const slug = new URL(release.url).pathname.match(/\/trailers\/(.*).html/)[1];
|
||||||
|
|
||||||
return `${slugify(formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(slug)}`;
|
return `${slugify(qu.formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(slug)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (release.date && release.title) {
|
if (release.date && release.title) {
|
||||||
return `${slugify(formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
|
return `${slugify(qu.formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
|
@ -82,18 +82,18 @@ function scrapeAll(scenes, channel) {
|
||||||
}
|
}
|
||||||
|
|
||||||
function scrapeAllT1(scenes, site, accNetworkReleases) {
|
function scrapeAllT1(scenes, site, accNetworkReleases) {
|
||||||
return scenes.map(({ qu }) => {
|
return scenes.map(({ query }) => {
|
||||||
const release = {};
|
const release = {};
|
||||||
|
|
||||||
release.title = qu.q('h4 a', 'title') || qu.q('h4 a', true);
|
release.title = query.q('h4 a', 'title') || query.q('h4 a', true);
|
||||||
release.url = qu.url('h4 a');
|
release.url = query.url('h4 a');
|
||||||
|
|
||||||
release.date = qu.date('.more-info-div', 'MMM D, YYYY');
|
release.date = query.date('.more-info-div', 'MMM D, YYYY');
|
||||||
release.duration = qu.dur('.more-info-div');
|
release.duration = query.dur('.more-info-div');
|
||||||
|
|
||||||
if (/bts|behind the scenes/i.test(release.title)) release.tags = ['behind the scenes'];
|
if (/bts|behind the scenes/i.test(release.title)) release.tags = ['behind the scenes'];
|
||||||
|
|
||||||
const posterPath = qu.q('.img-div img', 'src0_1x') || qu.img('img.video_placeholder');
|
const posterPath = query.q('.img-div img', 'src0_1x') || query.img('img.video_placeholder');
|
||||||
|
|
||||||
if (posterPath) {
|
if (posterPath) {
|
||||||
const poster = /^http/.test(posterPath) ? posterPath : `${site.parameters?.media || site.url}${posterPath}`;
|
const poster = /^http/.test(posterPath) ? posterPath : `${site.parameters?.media || site.url}${posterPath}`;
|
||||||
|
@ -109,7 +109,7 @@ function scrapeAllT1(scenes, site, accNetworkReleases) {
|
||||||
release.entryId = deriveEntryId(release);
|
release.entryId = deriveEntryId(release);
|
||||||
|
|
||||||
if (site.parameters?.accFilter && accNetworkReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) {
|
if (site.parameters?.accFilter && accNetworkReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) {
|
||||||
// filter out releases that were already scraped from a categorized site, requires sequential site scraping
|
// filter out releases that were already scraped from a categorized site, requires sequential site scraping
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -129,10 +129,10 @@ function scrapeScene({ html, query }, channel, url) {
|
||||||
release.actors = query.cnts('.update_models a');
|
release.actors = query.cnts('.update_models a');
|
||||||
|
|
||||||
const posterPath = html.match(/poster="([\w-/.]+)"/)?.[1];
|
const posterPath = html.match(/poster="([\w-/.]+)"/)?.[1];
|
||||||
const poster = prefixUrl(posterPath, channel.url);
|
const poster = qu.prefixUrl(posterPath, channel.url) || query.img('.update_thumb', 'src0_1x', { origin: channel.url }); // latter used when trailer requires signup
|
||||||
|
|
||||||
[release.poster, ...release.photos] = [poster, ...query.imgs('.item-thumb img', 'src0_1x', { origin: channel.url })]
|
[release.poster, ...release.photos] = [poster, ...query.imgs('.item-thumb img', 'src0_1x', { origin: channel.url })]
|
||||||
.map(src => [
|
.map(src => src && [
|
||||||
src.replace('-1x', '-3x'),
|
src.replace('-1x', '-3x'),
|
||||||
src.replace('-1x', '-2x'),
|
src.replace('-1x', '-2x'),
|
||||||
src,
|
src,
|
||||||
|
@ -141,8 +141,7 @@ function scrapeScene({ html, query }, channel, url) {
|
||||||
const trailerPath = html.match(/\/trailers\/.*.mp4/);
|
const trailerPath = html.match(/\/trailers\/.*.mp4/);
|
||||||
|
|
||||||
if (trailerPath) {
|
if (trailerPath) {
|
||||||
// release.trailer = { src: `${channel.parameters?.media || channel.url}${trailerPath}` };
|
release.trailer = qu.prefixUrl(trailerPath, channel.parameters?.media || channel.url);
|
||||||
release.trailer = prefixUrl(trailerPath, channel.parameters?.media || channel.url);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
release.tags = query.cnts('.featuring a[href*="categories/"]');
|
release.tags = query.cnts('.featuring a[href*="categories/"]');
|
||||||
|
@ -153,31 +152,31 @@ function scrapeScene({ html, query }, channel, url) {
|
||||||
return release;
|
return release;
|
||||||
}
|
}
|
||||||
|
|
||||||
function scrapeSceneT1({ html, qu }, site, url, baseRelease) {
|
function scrapeSceneT1({ html, query }, site, url, baseRelease) {
|
||||||
const release = { url };
|
const release = { url };
|
||||||
|
|
||||||
release.title = qu.q('.trailer-section-head .section-title', true);
|
release.title = query.q('.trailer-section-head .section-title', true);
|
||||||
release.description = qu.text('.row .update-info-block');
|
release.description = query.text('.row .update-info-block');
|
||||||
|
|
||||||
release.date = qu.date('.update-info-row', 'MMM D, YYYY', /\w+ \d{1,2}, \d{4}/);
|
release.date = query.date('.update-info-row', 'MMM D, YYYY', /\w+ \d{1,2}, \d{4}/);
|
||||||
release.duration = qu.dur('.update-info-row:nth-child(2)');
|
release.duration = query.dur('.update-info-row:nth-child(2)');
|
||||||
|
|
||||||
release.actors = qu.all('.models-list-thumbs a').map(el => ({
|
release.actors = query.all('.models-list-thumbs a').map(el => ({
|
||||||
name: qu.q(el, 'span', true),
|
name: query.q(el, 'span', true),
|
||||||
avatar: getImageWithFallbacks(qu.q, 'img', site, el),
|
avatar: getImageWithFallbacks(query.q, 'img', site, el),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
release.tags = qu.all('.tags a', true);
|
release.tags = query.all('.tags a', true);
|
||||||
|
|
||||||
// const posterPath = html.match(/poster="(.*\.jpg)/)?.[1];
|
// const posterPath = html.match(/poster="(.*\.jpg)/)?.[1];
|
||||||
const posterPath = qu.q('.player-thumb img', 'src0_1x');
|
const posterPath = query.q('.player-thumb img', 'src0_1x');
|
||||||
[release.poster, release.photos] = extractPoster(posterPath, site, baseRelease);
|
[release.poster, release.photos] = extractPoster(posterPath, site, baseRelease);
|
||||||
|
|
||||||
const trailer = html.match(/<video.*src="(.*\.mp4)/)?.[1];
|
const trailer = html.match(/<video.*src="(.*\.mp4)/)?.[1];
|
||||||
if (trailer && /^http/.test(trailer)) release.trailer = { src: trailer, referer: url };
|
if (trailer && /^http/.test(trailer)) release.trailer = { src: trailer, referer: url };
|
||||||
else if (trailer) release.trailer = { src: `${site.parameters?.media || site.url}${trailer}`, referer: url };
|
else if (trailer) release.trailer = { src: `${site.parameters?.media || site.url}${trailer}`, referer: url };
|
||||||
|
|
||||||
const stars = qu.q('.update-rating', true).match(/\d.\d/)?.[0];
|
const stars = query.q('.update-rating', true).match(/\d.\d/)?.[0];
|
||||||
if (stars) release.stars = Number(stars);
|
if (stars) release.stars = Number(stars);
|
||||||
|
|
||||||
if (site.type === 'network') {
|
if (site.type === 'network') {
|
||||||
|
@ -195,10 +194,10 @@ function scrapeSceneT1({ html, qu }, site, url, baseRelease) {
|
||||||
return release;
|
return release;
|
||||||
}
|
}
|
||||||
|
|
||||||
function scrapeProfileT1({ el, qu }, site) {
|
function scrapeProfileT1({ el, query }, site) {
|
||||||
const profile = {};
|
const profile = {};
|
||||||
|
|
||||||
const bio = qu.all('.detail-div + .detail-div p, .detail-div p', true).reduce((acc, info) => {
|
const bio = query.all('.detail-div + .detail-div p, .detail-div p', true).reduce((acc, info) => {
|
||||||
const [key, value] = info.split(':');
|
const [key, value] = info.split(':');
|
||||||
|
|
||||||
if (!value) return acc;
|
if (!value) return acc;
|
||||||
|
@ -225,15 +224,30 @@ function scrapeProfileT1({ el, qu }, site) {
|
||||||
if (heightMetric) profile.height = Number(heightMetric[1]);
|
if (heightMetric) profile.height = Number(heightMetric[1]);
|
||||||
if (heightImperial) profile.height = feetInchesToCm(Number(heightImperial[0]), Number(heightImperial[1]));
|
if (heightImperial) profile.height = feetInchesToCm(Number(heightImperial[0]), Number(heightImperial[1]));
|
||||||
|
|
||||||
profile.avatar = getImageWithFallbacks(qu.q, '.img-div img', site);
|
profile.avatar = getImageWithFallbacks(query.q, '.img-div img', site);
|
||||||
|
|
||||||
const qReleases = ctxa(el, '.item-video');
|
const qReleases = qu.initAll(el, '.item-video');
|
||||||
profile.releases = scrapeAllT1(qReleases, site);
|
profile.releases = scrapeAllT1(qReleases, site);
|
||||||
|
|
||||||
return profile;
|
return profile;
|
||||||
}
|
}
|
||||||
|
|
||||||
function scrapeProfile({ query }, channel) {
|
/**
 * Collect an actor's scenes from a profile page and every page after it.
 *
 * Scrapes the `.item-video` elements of the current page, then follows the
 * `.next a` link (if any) and recurses on the page it points to. Scenes are
 * accumulated in page order: everything gathered so far comes first, the
 * current page's scenes are appended after it.
 *
 * @param {{ query: object, el: object }} context - query helpers and root element of the current page
 * @param {object} channel - passed through unchanged to `scrapeAll`
 * @param {Array} [accScenes=[]] - scenes gathered from earlier pages
 * @returns {Promise<Array>} scenes of all pages reached, first page first
 */
async function fetchActorScenes({ query, el }, channel, accScenes = []) {
	const scenes = scrapeAll(qu.initAll(el, '.item-video'), channel);
	// Always append to the accumulator; prepending here while appending on the
	// final page produced an inconsistent order such as [page2, page1, page3].
	const collected = accScenes.concat(scenes);
	const nextPage = query.url('.next a');

	if (!nextPage) {
		return collected;
	}

	const res = await qu.get(nextPage);

	if (!res.ok) {
		// best effort: keep what was gathered when a follow-up page fails to load
		return collected;
	}

	return fetchActorScenes(res.item, channel, collected);
}
|
||||||
|
|
||||||
|
async function scrapeProfile({ query, el }, channel, options) {
|
||||||
const profile = {};
|
const profile = {};
|
||||||
|
|
||||||
const bio = query.all('.stats li').reduce((acc, bioEl) => {
|
const bio = query.all('.stats li').reduce((acc, bioEl) => {
|
||||||
|
@ -246,7 +260,7 @@ function scrapeProfile({ query }, channel) {
|
||||||
};
|
};
|
||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
if (bio.date_of_birth) profile.birthdate = ed(bio.date_of_birth, 'MMMM D, YYYY');
|
if (bio.date_of_birth) profile.birthdate = qu.extractDate(bio.date_of_birth, 'MMMM D, YYYY');
|
||||||
if (bio.birthplace) profile.birthPlace = bio.birthplace;
|
if (bio.birthplace) profile.birthPlace = bio.birthplace;
|
||||||
if (bio.fun_fact) profile.description = bio.fun_fact;
|
if (bio.fun_fact) profile.description = bio.fun_fact;
|
||||||
|
|
||||||
|
@ -286,6 +300,10 @@ function scrapeProfile({ query }, channel) {
|
||||||
query.img('.profile-pic img', 'src0_1x', { origin: channel.url }),
|
query.img('.profile-pic img', 'src0_1x', { origin: channel.url }),
|
||||||
];
|
];
|
||||||
|
|
||||||
|
if (options.includeActorScenes) {
|
||||||
|
profile.releases = await fetchActorScenes({ query, el }, channel);
|
||||||
|
}
|
||||||
|
|
||||||
return profile;
|
return profile;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -294,7 +312,7 @@ async function fetchLatest(site, page = 1, include, { uniqueReleases = [], dupli
|
||||||
|| (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`)
|
|| (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`)
|
||||||
|| `${site.url}/categories/movies_${page}_d.html`;
|
|| `${site.url}/categories/movies_${page}_d.html`;
|
||||||
|
|
||||||
const res = await getAll(url, '.modelfeature, .item-video, .updateItem');
|
const res = await qu.getAll(url, '.modelfeature, .item-video, .updateItem');
|
||||||
|
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
return res.status;
|
return res.status;
|
||||||
|
@ -308,7 +326,7 @@ async function fetchLatest(site, page = 1, include, { uniqueReleases = [], dupli
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchScene(url, site, baseRelease) {
|
async function fetchScene(url, site, baseRelease) {
|
||||||
const res = await get(url);
|
const res = await qu.get(url);
|
||||||
|
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
return res.status;
|
return res.status;
|
||||||
|
@ -321,19 +339,19 @@ async function fetchScene(url, site, baseRelease) {
|
||||||
return scrapeScene(res.item, site, url, baseRelease);
|
return scrapeScene(res.item, site, url, baseRelease);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchProfile({ name: actorName }, { site }) {
|
async function fetchProfile({ name: actorName }, { site }, options) {
|
||||||
const actorSlugA = slugify(actorName, '');
|
const actorSlugA = slugify(actorName, '');
|
||||||
const actorSlugB = slugify(actorName);
|
const actorSlugB = slugify(actorName);
|
||||||
|
|
||||||
const t1 = site.parameters?.t1 ? 't1/' : '';
|
const t1 = site.parameters?.t1 ? 't1/' : '';
|
||||||
|
|
||||||
const res1 = site.parameters?.profile
|
const res1 = site.parameters?.profile
|
||||||
? await get(util.format(site.parameters.profile, actorSlugA))
|
? await qu.get(util.format(site.parameters.profile, actorSlugA))
|
||||||
: await get(`${site.url}/${t1}models/${actorSlugA}.html`, null, null, { followRedirects: false });
|
: await qu.get(`${site.url}/${t1}models/${actorSlugA}.html`, null, null, { followRedirects: false });
|
||||||
|
|
||||||
const res = (res1.ok && res1)
|
const res = (res1.ok && res1)
|
||||||
|| (site.parameters?.profile && await get(util.format(site.parameters.profile, actorSlugB)))
|
|| (site.parameters?.profile && await qu.get(util.format(site.parameters.profile, actorSlugB)))
|
||||||
|| await get(`${site.url}/${t1}models/${actorSlugB}.html`, null, null, { followRedirects: false });
|
|| await qu.get(`${site.url}/${t1}models/${actorSlugB}.html`, null, null, { followRedirects: false });
|
||||||
|
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
return res.status;
|
return res.status;
|
||||||
|
@ -343,7 +361,7 @@ async function fetchProfile({ name: actorName }, { site }) {
|
||||||
return scrapeProfileT1(res.item, site);
|
return scrapeProfileT1(res.item, site);
|
||||||
}
|
}
|
||||||
|
|
||||||
return scrapeProfile(res.item, site);
|
return scrapeProfile(res.item, site, options);
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|
|
@ -15,18 +15,18 @@ function include(argv) {
|
||||||
return {
|
return {
|
||||||
...options,
|
...options,
|
||||||
// legacy
|
// legacy
|
||||||
covers: include.includeCovers,
|
covers: options.includeCovers,
|
||||||
media: include.includeMedia,
|
media: options.includeMedia,
|
||||||
photos: include.includePhotos,
|
photos: options.includePhotos,
|
||||||
videos: include.includeVideos,
|
videos: options.includeVideos,
|
||||||
poster: include.includePosters,
|
poster: options.includePosters,
|
||||||
posters: include.includePosters,
|
posters: options.includePosters,
|
||||||
teaser: include.includeTeasers,
|
teaser: options.includeTeasers,
|
||||||
teasers: include.includeTeasers,
|
teasers: options.includeTeasers,
|
||||||
trailer: include.includeTrailers,
|
trailer: options.includeTrailers,
|
||||||
trailers: include.includeTrailers,
|
trailers: options.includeTrailers,
|
||||||
releases: include.includeActorScenes,
|
releases: options.includeActorScenes,
|
||||||
scenes: include.includeActorScenes,
|
scenes: options.includeActorScenes,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue