traxxx/src/scrapers/archangel.js

220 lines
6.3 KiB
JavaScript
Executable File

'use strict';
// ALSO USED BY THE FLOURISH
const unprint = require('unprint');
const slugify = require('../utils/slugify');
const { feetInchesToCm } = require('../utils/convert');
// Image paths such as `images/p12.jpg` are treated as placeholders: posters and avatars matching this are skipped.
const placeholder = /images\/p\d+\.jpe?g/i;
/**
 * Derive the entry ID for a release.
 *
 * The file name of the release URL (the part before `.html`) is preferred. When the URL is
 * missing, cannot be parsed or has no such file name, the ID is built from the release date,
 * title and actor names instead.
 *
 * @param {object} release - Release being scraped.
 * @param {string} [release.url] - Absolute URL of the release page.
 * @param {*} [release.date] - Release date, only used for the fallback ID.
 * @param {string} [release.title] - Release title, only used for the fallback ID.
 * @param {Array<string|{ name: string }>} [release.actors] - Actors, only used for the fallback ID.
 * @returns {*} Whatever `slugify` returns for the chosen ID parts.
 */
function getEntryId(release) {
  let pathId = null;

  try {
    pathId = new URL(release.url).pathname.match(/\/([\w-]+)\.html/)?.[1];
  } catch (error) {
    // new URL() throws on a missing or relative URL; the fallback ID below covers that case
  }

  if (pathId) {
    return slugify(pathId);
  }

  // actors may be plain names or { name, url } objects, and are not set at all by scrapeMovie
  const actorNames = (release.actors || []).map((actor) => actor?.name ?? actor);

  return slugify([
    release.date && unprint.formatDate(release.date, 'YYYY-MM-DD'),
    release.title,
    ...actorNames,
  ].filter(Boolean));
}
/**
 * Scrape the releases listed on an overview, movie or profile page.
 *
 * @param {Array<{ query: object }>} scenes - One unprint context per `.item-video` element.
 * @returns {object[]} One release per context, with `url`, `title`, `date`, `duration`, `actors`,
 *   `photoCount` and `entryId`, plus `poster` and `photos` when thumbnails are available.
 */
function scrapeAll(scenes) {
  return scenes.map(({ query }) => {
    const release = {};

    release.url = query.url('a');
    release.title = query.content('a span');

    release.date = query.date('.timeDate', 'YYYY-MM-DD');
    release.duration = query.duration('.timeDate');

    release.actors = query.all('a[href*="models/"], a[href*="sets.php"]').map((actorEl) => ({
      name: unprint.query.content(actorEl),
      url: unprint.query.url(actorEl, null),
    }));

    const poster = query.img('img.mainThumb');
    const previewCount = query.number('img.mainThumb', { attribute: 'cnt' });

    if (poster && !placeholder.test(poster)) {
      // ordered by preference, the thumbnail source as found on the page is the last resort
      release.poster = [
        poster.replace('-1x', '-3x'),
        poster.replace('-1x', '-2x'),
        poster.replace('-1x', '-4x'),
        poster,
      ];
    }

    if (previewCount) {
      release.photos = Array.from(
        { length: previewCount - 1 },
        // preview attributes are numbered from 1: src1_3x, src1_2x, ...
        (value, index) => [3, 2, 4, 1] // 4x is unnecessarily big and possibly upscaled
          .map((scale) => unprint.prefixUrl(query.img('img.mainThumb', { attribute: `src${index + 1}_${scale}x` })))
          .filter(Boolean),
      ).filter((sources) => sources.length > 0); // an empty array is truthy, so check the length to drop previews without any source
    }

    release.photoCount = query.number('.timeDate', { match: /(\d+) photos/i, matchIndex: 1 });
    release.entryId = getEntryId(release);

    return release;
  });
}
/**
 * Scrape a single scene page.
 *
 * @param {object} context - unprint context of the scene page.
 * @param {object} context.query - Query helpers bound to the page.
 * @param {string} context.html - Raw page HTML, searched for the video poster and trailer source.
 * @param {object} options
 * @param {string} options.url - URL of the scene page.
 * @param {object} options.entity - Entity being scraped; `entity.url` is the origin for relative media URLs.
 * @param {object} [options.baseRelease] - Release scraped earlier for this scene, if any.
 * @returns {object} The scraped release.
 */
function scrapeScene({ query, html }, { url, entity, baseRelease }) {
  const release = { url };

  release.title = query.content('.title h2');
  release.description = query.content('.description p');

  release.date = query.date('.info p', 'MMMM D, YYYY');
  release.duration = query.duration('.info p');

  release.actors = query.all('.info a[href*="models/"], .info a[href*="sets.php"]').map((actorEl) => ({
    name: unprint.query.content(actorEl),
    url: unprint.query.url(actorEl, null),
  }));

  // [^"]* keeps the capture inside one attribute value; a greedy .* would run on to the last .jpg" on the line
  const poster = unprint.prefixUrl(query.img('.update_thumb') || html.match(/poster="([^"]*\.jpg)"/)?.[1], entity.url);

  if (poster && !placeholder.test(poster)) {
    const posterFallbacks = [
      poster.replace('-1x', '-3x'),
      poster.replace('-1x', '-2x'),
      poster.replace('-1x', '-4x'),
      poster,
    ];

    // scene page poster usually different from overview page, don't replace
    // NOTE(review): scrapeAll stores the poster as an array of fallbacks, and an array never equals
    // this string, so the inequality holds for such a base poster — confirm whether a membership check was intended
    if (baseRelease?.poster && baseRelease.poster !== poster) {
      release.photos = baseRelease.photos
        ? [posterFallbacks, ...baseRelease.photos]
        : [posterFallbacks];
    } else {
      release.poster = posterFallbacks;
    }
  }

  // same reasoning as the poster pattern: skip src attributes that do not themselves end in .mp4
  const trailer = html.match(/src="([^"]*\.mp4)"/)?.[1];

  if (trailer) {
    release.trailer = unprint.prefixUrl(encodeURI(trailer), entity.url);
  }

  release.tags = query.contents('.info .tags a');
  release.photoCount = query.number('.info', { match: /(\d+) photos/i, matchIndex: 1 });

  release.entryId = getEntryId(release);

  return release;
}
/**
 * Scrape a movie page, including the scenes listed on it.
 *
 * @param {object} context - unprint context of the movie page.
 * @param {object} context.query - Query helpers bound to the page.
 * @param {object} context.element - Page element, searched for `.item-video` scene tiles.
 * @param {object} options
 * @param {object} options.entity - Entity being scraped; `entity.url` is the origin for cover URLs.
 * @param {string} options.url - URL of the movie page.
 * @returns {object} The scraped movie with `covers`, `entryId` and `scenes`.
 */
function scrapeMovie({ query, element }, { entity, url }) {
  // in order of preference; 4x and 3x come last because they are usually upscaled
  const coverAttributes = ['src0_2x', 'src0_1x', 'src0', 'src0_4x', 'src0_3x'];

  const title = query.content('.title h2');
  const description = query.content('.aboutArea p');

  const coverSources = coverAttributes
    .map((attribute) => query.img('.update_thumb', { attribute, origin: entity.url }))
    .filter(Boolean);

  const release = {
    url,
    title,
    description,
    covers: [coverSources], // a single cover with multiple fallback sources
  };

  release.entryId = getEntryId(release);

  const sceneContexts = unprint.initAll(element, '.item-video');
  release.scenes = scrapeAll(sceneContexts);

  return release;
}
/**
 * Scrape an actor profile page, including the scenes listed on it.
 *
 * @param {object} context - unprint context of the profile page.
 * @param {object} context.query - Query helpers bound to the page.
 * @param {object} context.element - Page element, searched for `.item-video` scene tiles.
 * @param {object} options
 * @param {string} options.url - URL of the profile page.
 * @param {object} options.entity - Entity being scraped; `entity.url` is the origin for avatar URLs.
 * @returns {object} The scraped profile.
 */
function scrapeProfile({ query, element }, { url, entity }) {
  const profile = { url };

  // one [label, text] pair per stats row, keyed by the slugified label (e.g. place_of_birth)
  const bioEntries = query.all('.stats li').map((row) => {
    const label = unprint.query.content(row, '.data-name, span');

    return [slugify(label, '_'), unprint.query.text(row)];
  });

  const bio = Object.fromEntries(bioEntries.filter(([key, value]) => key && value));

  profile.description = query.content('.aboutArea p');
  profile.birthPlace = bio.place_of_birth;
  profile.dateOfBirth = unprint.extractDate(bio.age, 'MMMM D, YYYY');

  // prefer an explicit centimetre value, otherwise convert a feet and inches notation
  const heightInCm = Number(bio.height?.match(/(\d+)\s*cm/)?.[1]);

  if (heightInCm) {
    profile.height = heightInCm;
  } else if (/\d fe*t \d+ inch/i.test(bio.height)) {
    profile.height = feetInchesToCm(bio.height) || null;
  } else {
    profile.height = null;
  }

  profile.measurements = bio.measurements;
  profile.hairColor = bio.hair_color;
  profile.eyes = bio.eye_color;

  // largest source first
  const avatarAttributes = ['src0_4x', 'src0_3x', 'src0_2x', 'src0_1x', 'src0'];

  profile.avatar = avatarAttributes
    .map((attribute) => query.img('.model_bio_thumb', { attribute, origin: entity.url }))
    .filter((avatar) => avatar && !placeholder.test(avatar));

  const sceneContexts = unprint.initAll(element, '.item-video');
  profile.scenes = scrapeAll(sceneContexts);

  return profile;
}
/**
 * Fetch one page of the latest releases of a channel.
 *
 * @param {object} channel - Channel to scrape; `channel.url` is the site origin.
 * @param {number} [page=1] - Overview page number.
 * @param {object} context - Scrape context; `context.parameters.path` is an optional path prefix.
 * @returns {Promise<object[]|*>} The scraped releases, or the response status when the request failed.
 */
async function fetchLatest(channel, page = 1, context) {
  const pathPrefix = context.parameters.path || '';
  const overviewUrl = `${channel.url}${pathPrefix}/categories/movies_${page}_d.html`;

  const res = await unprint.get(overviewUrl, { selectAll: '.item-video' });

  return res.ok
    ? scrapeAll(res.context, channel)
    : res.status;
}
/**
 * Fetch and scrape an actor profile, trying the known actor URL first and then URLs guessed
 * from the actor name (dash-separated slug, then slug without separator).
 *
 * @param {object} actor
 * @param {string} actor.name - Actor name, used to guess the profile URL.
 * @param {string} [actor.url] - Known profile URL, tried first when set.
 * @param {object} options
 * @param {object} options.entity - Entity being scraped; `entity.url` is the site origin.
 * @param {*} [options.include] - Passed through to scrapeProfile.
 * @param {object} options.parameters - `parameters.path` is an optional path prefix.
 * @returns {Promise<object|*>} The scraped profile, or the status of the last failed request.
 */
async function fetchProfile({ name: actorName, url: actorUrl }, { entity, include, parameters }) {
  const modelsUrl = `${entity.url}${parameters.path || ''}/models`;

  // both slugs are identical for single-word names, don't request the same URL twice
  const candidateUrls = [...new Set([
    actorUrl,
    `${modelsUrl}/${slugify(actorName, '-')}.html`,
    `${modelsUrl}/${slugify(actorName, '')}.html`,
  ].filter(Boolean))];

  // candidates are tried one after another, stopping at the first successful response
  const res = await candidateUrls.reduce(async (chain, url) => {
    const prevRes = await chain;

    if (prevRes.ok) {
      return prevRes;
    }

    const actorRes = await unprint.get(url);

    // keep failed responses as well, so the status of the last attempt can be reported
    return {
      ...actorRes,
      url,
    };
  }, Promise.resolve({ ok: false, status: null }));

  if (res.ok) {
    return scrapeProfile(res.context, { entity, include, url: res.url });
  }

  return res.status;
}
// Public scraper interface. scrapeAll and scrapeProfile are not exported; they are reached
// through fetchLatest, fetchProfile and scrapeMovie.
module.exports = {
fetchLatest,
fetchProfile,
// NOTE(review): `unprint: true` presumably tells the caller to pass an unprint context
// ({ query, html, element }) to the scraper — confirm against the code that loads this module
scrapeScene: {
scraper: scrapeScene,
unprint: true,
},
scrapeMovie: {
scraper: scrapeMovie,
unprint: true,
},
};