diff --git a/seeds/02_sites.js b/seeds/02_sites.js index 454e7280..463ba12b 100644 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -3070,7 +3070,7 @@ const sites = [ { slug: 'interracialpovs', name: 'Interracial POVs', - url: 'https://www.interracialpovs.com', + url: 'https://interracialpovs.com', tags: ['interracial', 'pov'], parent: 'hussiepass', }, diff --git a/src/actors.js b/src/actors.js index 72e9241a..7b1d8f0b 100644 --- a/src/actors.js +++ b/src/actors.js @@ -396,7 +396,7 @@ async function curateProfile(profile) { }).filter(Boolean) : []; - curatedProfile.releases = toBaseReleases(profile.releases); + curatedProfile.releases = toBaseReleases(profile.releases, profile.entity); if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`); if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`); diff --git a/src/deep.js b/src/deep.js index 2b059ff6..bc9deb54 100644 --- a/src/deep.js +++ b/src/deep.js @@ -46,7 +46,7 @@ async function findEntities(baseReleases) { return entitiesBySlug; } -function toBaseReleases(baseReleasesOrUrls) { +function toBaseReleases(baseReleasesOrUrls, entity = null) { if (!baseReleasesOrUrls) { return []; } @@ -57,6 +57,7 @@ function toBaseReleases(baseReleasesOrUrls) { // base release with URL return { ...baseReleaseOrUrl, + entity, deep: false, }; } @@ -65,6 +66,7 @@ function toBaseReleases(baseReleasesOrUrls) { // URL return { url: baseReleaseOrUrl, + entity, deep: false, }; } @@ -73,6 +75,7 @@ function toBaseReleases(baseReleasesOrUrls) { // base release without URL, prepare for passthrough return { ...baseReleaseOrUrl, + entity, deep: false, }; } diff --git a/src/scrapers/hush.js b/src/scrapers/hush.js index 55b5082a..4a705162 100644 --- a/src/scrapers/hush.js +++ b/src/scrapers/hush.js @@ -2,7 +2,7 
@@ const util = require('util'); -const { get, getAll, ed, formatDate, prefixUrl, ctxa } = require('../utils/q'); +const qu = require('../utils/q'); const slugify = require('../utils/slugify'); const { feetInchesToCm, inchesToCm } = require('../utils/convert'); @@ -10,11 +10,11 @@ function deriveEntryId(release) { if (release.date && release.url) { const slug = new URL(release.url).pathname.match(/\/trailers\/(.*).html/)[1]; - return `${slugify(formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(slug)}`; + return `${slugify(qu.formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(slug)}`; } if (release.date && release.title) { - return `${slugify(formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`; + return `${slugify(qu.formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`; } return null; @@ -82,18 +82,18 @@ function scrapeAll(scenes, channel) { } function scrapeAllT1(scenes, site, accNetworkReleases) { - return scenes.map(({ qu }) => { + return scenes.map(({ query }) => { const release = {}; - release.title = qu.q('h4 a', 'title') || qu.q('h4 a', true); - release.url = qu.url('h4 a'); + release.title = query.q('h4 a', 'title') || query.q('h4 a', true); + release.url = query.url('h4 a'); - release.date = qu.date('.more-info-div', 'MMM D, YYYY'); - release.duration = qu.dur('.more-info-div'); + release.date = query.date('.more-info-div', 'MMM D, YYYY'); + release.duration = query.dur('.more-info-div'); if (/bts|behind the scenes/i.test(release.title)) release.tags = ['behind the scenes']; - const posterPath = qu.q('.img-div img', 'src0_1x') || qu.img('img.video_placeholder'); + const posterPath = query.q('.img-div img', 'src0_1x') || query.img('img.video_placeholder'); if (posterPath) { const poster = /^http/.test(posterPath) ? 
posterPath : `${site.parameters?.media || site.url}${posterPath}`; @@ -109,7 +109,7 @@ function scrapeAllT1(scenes, site, accNetworkReleases) { release.entryId = deriveEntryId(release); if (site.parameters?.accFilter && accNetworkReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) { - // filter out releases that were already scraped from a categorized site, requires sequential site scraping + // filter out releases that were already scraped from a categorized site, requires sequential site scraping return null; } @@ -129,10 +129,10 @@ function scrapeScene({ html, query }, channel, url) { release.actors = query.cnts('.update_models a'); const posterPath = html.match(/poster="([\w-/.]+)"/)?.[1]; - const poster = prefixUrl(posterPath, channel.url); + const poster = qu.prefixUrl(posterPath, channel.url) || query.img('.update_thumb', 'src0_1x', { origin: channel.url }); // latter used when trailer requires signup [release.poster, ...release.photos] = [poster, ...query.imgs('.item-thumb img', 'src0_1x', { origin: channel.url })] - .map(src => [ + .map(src => src && [ src.replace('-1x', '-3x'), src.replace('-1x', '-2x'), src, @@ -141,8 +141,7 @@ function scrapeScene({ html, query }, channel, url) { const trailerPath = html.match(/\/trailers\/.*.mp4/); if (trailerPath) { - // release.trailer = { src: `${channel.parameters?.media || channel.url}${trailerPath}` }; - release.trailer = prefixUrl(trailerPath, channel.parameters?.media || channel.url); + release.trailer = qu.prefixUrl(trailerPath, channel.parameters?.media || channel.url); } release.tags = query.cnts('.featuring a[href*="categories/"]'); @@ -153,31 +152,31 @@ function scrapeScene({ html, query }, channel, url) { return release; } -function scrapeSceneT1({ html, qu }, site, url, baseRelease) { +function scrapeSceneT1({ html, query }, site, url, baseRelease) { const release = { url }; - release.title = qu.q('.trailer-section-head .section-title', true); - release.description = 
qu.text('.row .update-info-block'); + release.title = query.q('.trailer-section-head .section-title', true); + release.description = query.text('.row .update-info-block'); - release.date = qu.date('.update-info-row', 'MMM D, YYYY', /\w+ \d{1,2}, \d{4}/); - release.duration = qu.dur('.update-info-row:nth-child(2)'); + release.date = query.date('.update-info-row', 'MMM D, YYYY', /\w+ \d{1,2}, \d{4}/); + release.duration = query.dur('.update-info-row:nth-child(2)'); - release.actors = qu.all('.models-list-thumbs a').map(el => ({ - name: qu.q(el, 'span', true), - avatar: getImageWithFallbacks(qu.q, 'img', site, el), + release.actors = query.all('.models-list-thumbs a').map(el => ({ + name: query.q(el, 'span', true), + avatar: getImageWithFallbacks(query.q, 'img', site, el), })); - release.tags = qu.all('.tags a', true); + release.tags = query.all('.tags a', true); // const posterPath = html.match(/poster="(.*\.jpg)/)?.[1]; - const posterPath = qu.q('.player-thumb img', 'src0_1x'); + const posterPath = query.q('.player-thumb img', 'src0_1x'); [release.poster, release.photos] = extractPoster(posterPath, site, baseRelease); const trailer = html.match(/ { + const bio = query.all('.detail-div + .detail-div p, .detail-div p', true).reduce((acc, info) => { const [key, value] = info.split(':'); if (!value) return acc; @@ -225,15 +224,30 @@ function scrapeProfileT1({ el, qu }, site) { if (heightMetric) profile.height = Number(heightMetric[1]); if (heightImperial) profile.height = feetInchesToCm(Number(heightImperial[0]), Number(heightImperial[1])); - profile.avatar = getImageWithFallbacks(qu.q, '.img-div img', site); + profile.avatar = getImageWithFallbacks(query.q, '.img-div img', site); - const qReleases = ctxa(el, '.item-video'); + const qReleases = qu.initAll(el, '.item-video'); profile.releases = scrapeAllT1(qReleases, site); return profile; } -function scrapeProfile({ query }, channel) { +async function fetchActorScenes({ query, el }, channel, accScenes = []) { + const 
scenes = scrapeAll(qu.initAll(el, '.item-video'), channel); + const nextPage = query.url('.next a'); + + if (nextPage) { + const res = await qu.get(nextPage); + + if (res.ok) { + return fetchActorScenes(res.item, channel, scenes.concat(accScenes)); + } + } + + return accScenes.concat(scenes); +} + +async function scrapeProfile({ query, el }, channel, options) { const profile = {}; const bio = query.all('.stats li').reduce((acc, bioEl) => { @@ -246,7 +260,7 @@ function scrapeProfile({ query }, channel) { }; }, {}); - if (bio.date_of_birth) profile.birthdate = ed(bio.date_of_birth, 'MMMM D, YYYY'); + if (bio.date_of_birth) profile.birthdate = qu.extractDate(bio.date_of_birth, 'MMMM D, YYYY'); if (bio.birthplace) profile.birthPlace = bio.birthplace; if (bio.fun_fact) profile.description = bio.fun_fact; @@ -286,6 +300,10 @@ function scrapeProfile({ query }, channel) { query.img('.profile-pic img', 'src0_1x', { origin: channel.url }), ]; + if (options.includeActorScenes) { + profile.releases = await fetchActorScenes({ query, el }, channel); + } + return profile; } @@ -294,7 +312,7 @@ async function fetchLatest(site, page = 1, include, { uniqueReleases = [], dupli || (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`) || `${site.url}/categories/movies_${page}_d.html`; - const res = await getAll(url, '.modelfeature, .item-video, .updateItem'); + const res = await qu.getAll(url, '.modelfeature, .item-video, .updateItem'); if (!res.ok) { return res.status; @@ -308,7 +326,7 @@ async function fetchLatest(site, page = 1, include, { uniqueReleases = [], dupli } async function fetchScene(url, site, baseRelease) { - const res = await get(url); + const res = await qu.get(url); if (!res.ok) { return res.status; @@ -321,19 +339,19 @@ async function fetchScene(url, site, baseRelease) { return scrapeScene(res.item, site, url, baseRelease); } -async function fetchProfile({ name: actorName }, { site }) { +async function fetchProfile({ name: actorName }, { site }, 
options) { const actorSlugA = slugify(actorName, ''); const actorSlugB = slugify(actorName); const t1 = site.parameters?.t1 ? 't1/' : ''; const res1 = site.parameters?.profile - ? await get(util.format(site.parameters.profile, actorSlugA)) - : await get(`${site.url}/${t1}models/${actorSlugA}.html`, null, null, { followRedirects: false }); + ? await qu.get(util.format(site.parameters.profile, actorSlugA)) + : await qu.get(`${site.url}/${t1}models/${actorSlugA}.html`, null, null, { followRedirects: false }); const res = (res1.ok && res1) - || (site.parameters?.profile && await get(util.format(site.parameters.profile, actorSlugB))) - || await get(`${site.url}/${t1}models/${actorSlugB}.html`, null, null, { followRedirects: false }); + || (site.parameters?.profile && await qu.get(util.format(site.parameters.profile, actorSlugB))) + || await qu.get(`${site.url}/${t1}models/${actorSlugB}.html`, null, null, { followRedirects: false }); if (!res.ok) { return res.status; @@ -343,7 +361,7 @@ async function fetchProfile({ name: actorName }, { site }) { return scrapeProfileT1(res.item, site); } - return scrapeProfile(res.item, site); + return scrapeProfile(res.item, site, options); } module.exports = { diff --git a/src/utils/argv-include.js b/src/utils/argv-include.js index 7c7853ba..dd24854a 100644 --- a/src/utils/argv-include.js +++ b/src/utils/argv-include.js @@ -15,18 +15,18 @@ function include(argv) { return { ...options, // legacy - covers: include.includeCovers, - media: include.includeMedia, - photos: include.includePhotos, - videos: include.includeVideos, - poster: include.includePosters, - posters: include.includePosters, - teaser: include.includeTeasers, - teasers: include.includeTeasers, - trailer: include.includeTrailers, - trailers: include.includeTrailers, - releases: include.includeActorScenes, - scenes: include.includeActorScenes, + covers: options.includeCovers, + media: options.includeMedia, + photos: options.includePhotos, + videos: options.includeVideos, + 
poster: options.includePosters, + posters: options.includePosters, + teaser: options.includeTeasers, + teasers: options.includeTeasers, + trailer: options.includeTrailers, + trailers: options.includeTrailers, + releases: options.includeActorScenes, + scenes: options.includeActorScenes, }; }