Refactored Hookup Hotshot scraper.
This commit is contained in:
parent
87afb4abe3
commit
ccc6d1c10c
|
@ -1,99 +1,105 @@
|
|||
'use strict';
|
||||
|
||||
const qu = require('../utils/q');
|
||||
const slugify = require('../utils/slugify');
|
||||
const unprint = require('unprint');
|
||||
|
||||
/**
 * Scrapes every scene tile of a listing page into a release object.
 *
 * Only the post-refactor selectors are used; the pre-refactor statements
 * that were interleaved with them in this span have been removed.
 *
 * @param {Array<{ query: object }>} scenes - One context per listing item; each `query` is
 *   expected to expose `url`, `content`, `date`, `duration`, `number` and `img`.
 * @param {object} [_channel] - Accepted so the caller's signature is kept; not read here.
 * @returns {object[]} One release per scene.
 * @throws {TypeError} If an item's URL is missing or has no `/trailers/<id>.html` path.
 */
function scrapeAll(scenes, _channel) {
	return scenes.map(({ query }) => {
		const release = {};

		release.url = query.url('.item-thumb a, .item-info h4 a');
		// the entry ID is the lowercased file name of the trailer page
		release.entryId = new URL(release.url).pathname.match(/\/trailers\/(.*)\.html/)[1].toLowerCase();

		release.title = query.content('.item-info h4 a, .item-info a[title]');
		release.date = query.date('.date', 'YYYY-MM-DD');
		release.duration = query.duration('.time');
		// NOTE(review): reads the same `.time` element as the duration; confirm it really carries the photo count
		release.photoCount = query.number('.time');

		// number of thumbnails exposed as src<N>_<density>x attributes on the main thumb
		const photoCount = query.number('.item-thumb img.mainThumb', { attribute: 'cnt' });

		if (photoCount) {
			// each thumbnail becomes a list of fallback sources; the first one serves as the poster
			const [poster, ...photos] = Array.from({ length: photoCount }, (value, index) => [
				query.img('.item-thumb img.mainThumb', { attribute: `src${index}_2x` }),
				query.img('.item-thumb img.mainThumb', { attribute: `src${index}_3x` }), // 3x is too big and usually inflated, try 2x first
				query.img('.item-thumb img.mainThumb', { attribute: `src${index}_1x` }),
			]);

			release.poster = poster;
			release.photos = photos;
		}

		return release;
	});
}
|
||||
|
||||
/**
 * Scrapes a single scene page into a release object.
 *
 * @param {object} context
 * @param {object} context.query - Query helper for the scene document; `content`, `contents`,
 *   `date`, `duration`, `number` and `all` are called on it.
 * @param {string} context.html - Raw page HTML; trailer and poster paths are matched from it.
 * @param {object} options
 * @param {string} options.url - Scene URL, expected to contain `/trailers/<id>.html`.
 * @param {object} options.entity - Entity whose `url` is used to prefix relative media paths.
 * @param {object} [options.baseRelease] - Release already scraped from the listing page, if any.
 * @returns {object} The scraped release. `poster` / `photos` are only set when the page
 *   exposes a poster.
 * @throws {TypeError} If `url` is invalid or has no `/trailers/<id>.html` path.
 */
function scrapeScene({ query, html }, { url, entity, baseRelease }) {
	const release = {};

	release.entryId = new URL(url).pathname.match(/\/trailers\/(.*)\.html/)[1].toLowerCase();

	release.title = query.content('.videoDetails h3');
	release.description = query.content('.videoDetails p');

	release.date = query.date('.videoInfo', 'MMMM D, YYYY');
	release.duration = query.duration('.videoInfo');

	release.actors = query.all('.update_models a').map((actorEl) => ({
		name: unprint.query.content(actorEl),
		url: unprint.query.url(actorEl, null),
	}));

	release.tags = query.contents('.featuring a[href*="categories/"]');
	release.photoCount = query.number('.videoInfo', { match: /(\d+) photos/i, matchIndex: 1 });

	release.trailer = unprint.prefixUrl(html.match(/src="(\/trailers\/.*\.mp4)"/)?.[1], entity.url);

	const posterPath = html.match(/poster="(\/content\/.*\.jpg)"/)?.[1];
	const posterUrl = posterPath ? unprint.prefixUrl(posterPath, entity.url) : null;

	// pages without a poster attribute used to crash on posterUrl.replace()
	if (!posterUrl) {
		return release;
	}

	// prefer 2x, then 3x, then the 1x URL found in the page; the Set drops
	// duplicates when the URL has no '-1x' marker to substitute
	const posterFallbacks = [...new Set([
		posterUrl.replace('-1x', '-2x'),
		posterUrl.replace('-1x', '-3x'),
		posterUrl,
	])];

	// scene page poster does not appear on update page
	if (baseRelease?.poster) {
		release.photos = [posterFallbacks, ...(baseRelease.photos || [])];
	} else {
		release.poster = posterFallbacks;
	}

	return release;
}
|
||||
|
||||
/**
 * Scrapes a model page into an actor profile.
 *
 * Only the post-refactor selectors are used; the earlier duplicate
 * assignments to `description` and `avatar` have been removed.
 *
 * @param {object} context
 * @param {object} context.query - Query helper for the profile document; `content` and `img`
 *   are called on it.
 * @returns {{ gender: string, description: (string|null), avatar: Array }} The profile;
 *   `avatar` lists the 1x, 2x and 3x sources in that order.
 */
function scrapeProfile({ query }) {
	const profile = {};

	// the gender is hard-coded rather than read from the page
	profile.gender = 'female';
	profile.description = query.content('.profile-about') || null;

	profile.avatar = [
		query.img('.profile-pic img', { attribute: 'src0_1x' }),
		// too big, not desirable unless 1x fails
		query.img('.profile-pic img', { attribute: 'src0_2x' }),
		query.img('.profile-pic img', { attribute: 'src0_3x' }),
	];

	return profile;
}
|
||||
|
||||
/**
 * Fetches one page of the latest-movies listing and scrapes its scene tiles.
 *
 * The pre-refactor request (`qu.getAll` on `/the-dates/page/<n>`) was left
 * interleaved here, redeclaring `url` and `res`; only the `unprint` request
 * is kept.
 *
 * @param {object} channel - Channel entity; its `url` is the site root.
 * @param {number} [page=1] - Listing page number.
 * @returns {Promise<object[]|number>} Scraped releases when the response is `ok`,
 *   otherwise the response's `status`.
 */
async function fetchLatest(channel, page = 1) {
	const url = `${channel.url}/categories/movies/${page}/latest/`;
	const res = await unprint.get(url, { selectAll: '.items .item-video' });

	if (res.ok) {
		return scrapeAll(res.context, channel);
	}

	return res.status;
}
|
||||
|
||||
async function fetchProfile({ name: actorName }, entity, include) {
|
||||
const url = `${entity.url}/girls/${slugify(actorName)}`;
|
||||
const res = await qu.get(url);
|
||||
async function fetchProfile(actor, entity) {
|
||||
const url = actor.url || `${entity.url}/models/${actor.slug}.html`;
|
||||
const res = await unprint.get(url);
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeProfile(res.item, actorName, entity, include);
|
||||
return scrapeProfile(res.context);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
|
@ -102,4 +108,5 @@ async function fetchProfile({ name: actorName }, entity, include) {
|
|||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
scrapeScene,
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue