Refactored Bang! scraper to match new website, first use of unprint.

This commit is contained in:
DebaucheryLibrarian
2022-11-27 04:22:58 +01:00
parent 3cf8776ca5
commit 6edd62c337
61 changed files with 2371 additions and 1233 deletions

View File

@@ -2,6 +2,7 @@
const config = require('config');
const util = require('util');
const unprint = require('unprint');
// const log = require('why-is-node-running');
const Inspector = require('inspector-api');
const fs = require('fs').promises;
@@ -25,6 +26,13 @@ const getFileEntries = require('./utils/file-entries');
// Inspector instance (from the inspector-api package required above), created once at module load.
const inspector = new Inspector();
// Mutable module-level flag, initially false; its readers and writers are outside this hunk.
let done = false;
// Configure unprint with a timeout of 5000 (presumably milliseconds — confirm against unprint's docs)
// and a desktop Chrome User-Agent header. Presumably these act as defaults for later unprint requests.
unprint.options({
timeout: 5000,
headers: {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
},
});
/*
function logActive() {
setTimeout(() => {

View File

@@ -2,6 +2,7 @@
const util = require('util');
const Promise = require('bluebird');
const unprint = require('unprint');
const { mergeAdvanced: merge } = require('object-merge-advanced');
const argv = require('./argv');
@@ -54,12 +55,33 @@ function toBaseReleases(baseReleasesOrUrls, entity = null) {
.filter(Boolean);
}
/**
 * Fetches a scene or movie page through unprint and hands the parsed context
 * to the scraper's `scrapeMovie` (when `type` is 'movie') or `scrapeScene`.
 *
 * @param {object} scraper - scraper module exposing scrapeScene and/or scrapeMovie
 * @param {string} url - page to request
 * @param {object} entity - passed through to the scraper
 * @param {object} baseRelease - passed through to the scraper
 * @param {object} options - passed to the scraper as its third argument
 * @param {string} type - 'movie' selects scrapeMovie, anything else scrapeScene
 * @returns {Promise<*>} the scraper's result, or the response status when the request was not ok
 */
async function fetchUnprintScene(scraper, url, entity, baseRelease, options, type) {
  // NOTE(review): rejectUnauthorized: false presumably disables TLS certificate verification — confirm this is intended.
  const response = await unprint.get(url, { rejectUnauthorized: false });

  if (!response.ok) {
    return response.status;
  }

  const scrapeMethod = type === 'movie' ? 'scrapeMovie' : 'scrapeScene';
  const meta = {
    url,
    entity,
    baseRelease,
    headers: response.headers,
  };

  return scraper[scrapeMethod](response.context, meta, options);
}
async function fetchScene(scraper, url, entity, baseRelease, options, type = 'scene') {
if ((type === 'scene' && scraper.fetchScene) || (type === 'movie' && scraper.fetchMovie)) {
return scraper[type === 'movie' ? 'fetchMovie' : 'fetchScene'](baseRelease.url, entity, baseRelease, options, null);
}
if ((type === 'scene' && scraper.scrapeScene) || (type === 'movie' && scraper.scrapeMovie)) {
if (scraper.useUnprint) {
return fetchUnprintScene(scraper, url, entity, baseRelease, options, type);
}
const session = qu.session();
const res = await qu.get(url, null, null, {

429
src/scrapers/bang-legacy.js Executable file
View File

@@ -0,0 +1,429 @@
'use strict';
const http = require('../utils/http');
const qu = require('../utils/qu');
const { extractDate } = require('../utils/qu');
const { inchesToCm } = require('../utils/convert');
const slugify = require('../utils/slugify');
const capitalize = require('../utils/capitalize');
// Identifier interpolated as the subdomain of the `*.us-east-1.aws.found.io` host queried below.
const clusterId = '617fb597b659459bafe6472470d9073a';
// Sent verbatim after `Basic ` in the Authorization header of every request to that host.
// NOTE(review): this is a credential hard-coded in source — confirm it is meant to be a public, read-only key.
const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
// Maps the single-letter gender codes found on scene actors to the gender names stored on releases.
const genderMap = {
M: 'male',
F: 'female',
};
/**
 * Builds the i.bang.com screenshot URL for one screenshot of a scene.
 *
 * @param {object} [item] - screenshot entry; its `screenId` is the file name
 * @param {object} scene - scene providing `dvd.id`, `type` and `order` for the path
 * @returns {string|null} the URL, or null when the DVD ID or screen ID is missing
 */
function getScreenUrl(item, scene) {
  const dvdId = scene.dvd?.id;
  const screenId = item?.screenId;

  if (dvdId && screenId) {
    return `https://i.bang.com/screenshots/${dvdId}/${scene.type}/${scene.order}/${screenId}.jpg`;
  }

  return null;
}
/**
 * Converts a hex ID into the URL-friendly form used in bang.com video URLs:
 * base64 with `+` replaced by `-`, `/` by `_` and the `=` padding by `,`.
 *
 * @param {string} id - hexadecimal ID
 * @returns {string} the encoded ID
 */
function encodeId(id) {
  const urlSafeChars = { '+': '-', '/': '_', '=': ',' };
  const base64 = Buffer.from(id, 'hex').toString('base64');

  return base64.replace(/[+/=]/g, (char) => urlSafeChars[char]);
}
/**
 * Reverses encodeId: restores the base64 alphabet (`-` to `+`, `_` to `/`,
 * `,` to `=`) and returns the decoded bytes as a hex string.
 *
 * @param {string} id - encoded ID as it appears in a video URL
 * @returns {string} hexadecimal ID
 */
function decodeId(id) {
  const base64Chars = { '-': '+', _: '/', ',': '=' };
  const base64 = id.replace(/[-_,]/g, (char) => base64Chars[char]);

  return Buffer.from(base64, 'base64').toString('hex');
}
/**
 * Requests signed URLs for a scene's photo set from bang.com's sign-images endpoint.
 *
 * One path is generated per photo (1-based, zero-padded to six digits) from
 * `scene.photos`, `scene.dvd.id` and `scene.identifier`.
 *
 * @param {object} scene - scene with `photos` (count), `dvd.id` and `identifier`
 * @returns {Promise<string[]|null>} URLs prefixed with the photos.bang.com origin,
 *   or null when the request failed or the response carries no images
 */
async function fetchPhotos(scene) {
  const toPhotoPath = (_value, index) => {
    const fileNumber = String(index + 1).padStart(6, '0');

    return `/${scene.dvd.id}/${scene.identifier}/final/${fileNumber}.jpg`;
  };

  const images = Array.from({ length: scene.photos }, toPhotoPath);
  const res = await http.post('https://www.bang.com/sign-images', { images }, { encodeJSON: false });
  const signedImages = res.ok && res.body.images;

  if (!signedImages) {
    return null;
  }

  return signedImages.map((image) => qu.prefixUrl(image, 'https://photos.bang.com'));
}
/**
 * Converts one scene document from the search index into a release object.
 *
 * Optional parts of the document (`dvd`, `actions`, `actors`, `screenshots`,
 * `series`, `identifier`) are tolerated when absent: the fields derived from
 * them are left out instead of throwing or producing URLs that contain
 * "undefined".
 *
 * @param {object} scene - scene document (the `_source` of a search hit)
 * @param {object} entity - unused here; kept for the shared scraper signature
 * @param {object} [options] - `includePhotos` additionally fetches the signed photo set
 * @returns {Promise<object>} the release
 */
async function scrapeScene(scene, entity, options) {
  const tags = [...(scene.genres || []), ...(scene.actions || [])];

  const release = {
    entryId: scene.id,
    // bonus scenes have no name of their own, so derive one from the DVD title
    title: scene.name || (scene.dvd?.name && scene.type === 'bonus' && capitalize(`${scene.dvd.name} - Bonus Scene ${scene.order || 1}`)) || null,
    description: scene.description,
    tags: tags.map((genre) => genre.name),
    duration: scene.duration,
  };

  const slug = slugify(release.title);
  release.url = `https://www.bang.com/video/${encodeId(release.entryId)}/${slug}`;

  // keep only the calendar day (UTC) of the release timestamp
  const date = new Date(scene.releaseDate);
  release.date = new Date(Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate()));

  release.actors = (scene.actors || []).map((actor) => ({ name: actor.name, gender: genderMap[actor.gender] }));

  if (scene.is4k) release.tags.push('4k');
  if (scene.gay) release.tags.push('gay');

  const screenshots = scene.screenshots || [];
  const defaultPoster = screenshots.find((photo) => photo.default === true);
  const screens = screenshots.filter((photo) => photo.default === false);

  // without an explicit default, the first screenshot becomes the poster and is not repeated as a photo
  const remainingScreens = defaultPoster ? screens : screens.slice(1);
  const poster = defaultPoster || screens[0];

  release.poster = getScreenUrl(poster, scene);
  // getScreenUrl returns null for screenshots it cannot build a URL for; don't pass those on
  release.photos = remainingScreens.map((photo) => getScreenUrl(photo, scene)).filter(Boolean);

  // fetchPhotos builds its paths from the DVD ID, so it is only usable when there is one
  if (options?.includePhotos && scene.dvd?.id) {
    const photos = await fetchPhotos(scene);

    if (photos?.length > 0) {
      release.photos = photos;
    }
  }

  if (scene.dvd?.id && scene.identifier) {
    release.teaser = `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`;
  }

  if (scene.series?.name) {
    release.channel = scene.series.name
      .replace(/[! .]/g, '')
      .replace('&', 'and');
  }

  return release;
}
/**
 * Scrapes every search hit into a release.
 *
 * @param {object[]} scenes - search hits; the scene document is each hit's `_source`
 * @param {object} entity - passed through to scrapeScene
 * @returns {Promise<object[]>} releases, in the order of the hits
 */
function scrapeAll(scenes, entity) {
  const pendingReleases = scenes.map((hit) => {
    const { _source: source } = hit;

    return scrapeScene(source, entity);
  });

  return Promise.all(pendingReleases);
}
/**
 * Fetches the releases an actor appears in from the video search index.
 *
 * Queries for documents with status 'ok' whose nested `actors.mongoId`
 * matches `actor.id`, excluding type 'trailer', newest first. Only a single
 * page of at most 50 hits is requested.
 *
 * @param {object} actor - actor document; `id` is used for the lookup
 * @param {object} entity - passed through to scrapeAll
 * @returns {Promise<object[]>} scraped releases, or an empty list when the
 *   request failed, so that a failing release lookup does not abort the profile
 */
async function fetchActorReleases(actor, entity) {
  const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
    size: 50,
    query: {
      bool: {
        must: [
          {
            match: {
              status: 'ok',
            },
          },
          {
            nested: {
              path: 'actors',
              query: {
                bool: {
                  must: [
                    {
                      match: {
                        'actors.mongoId': {
                          operator: 'AND',
                          query: actor.id,
                        },
                      },
                    },
                  ],
                },
              },
            },
          },
        ],
        must_not: [
          {
            match: {
              type: 'trailer',
            },
          },
        ],
      },
    },
    sort: [
      {
        releaseDate: {
          order: 'desc',
        },
      },
    ],
  }, {
    encodeJSON: true,
    headers: {
      Authorization: `Basic ${authKey}`,
    },
  });

  // a failed request carries no usable hits; reading res.body.hits would throw
  if (!res.ok) {
    return [];
  }

  return scrapeAll(res.body.hits.hits, entity);
}
/**
 * Converts an actor document from the search index into a profile.
 *
 * @param {object} actor - actor document (the `_source` of a search hit)
 * @param {object} entity - passed through to fetchActorReleases
 * @param {object} [include] - `releases` additionally fetches the actor's releases
 * @returns {Promise<object>} the profile
 */
async function scrapeProfile(actor, entity, include) {
  const profile = {};

  profile.aliases = actor.aliases;
  profile.dateOfBirth = extractDate(actor.birthDate);
  profile.gender = genderMap[actor.gender];

  profile.ethnicity = actor.ethnicity;
  profile.nationality = actor.nationality;
  // join only the parts that exist, so a missing city or country leaves no "undefined" or dangling comma
  profile.birthPlace = [actor.birthCity, actor.birthCountry].filter(Boolean).join(', ') || null;

  profile.hair = actor.hairColor;
  profile.eyes = actor.eyeColor;

  profile.naturalBoobs = actor.naturalBreasts;

  if (actor.measurements) {
    const { cupSize, shoulder, chest, waist, height } = actor.measurements;

    if (height) profile.height = inchesToCm(height);
    if (cupSize) profile.cup = cupSize;

    // [SIC] the source fields are shifted: shoulder holds the bust, chest the waist and waist the hip
    if (shoulder) profile.bust = shoulder;
    if (chest) profile.waist = chest;
    if (waist) profile.hip = waist;
  }

  if (actor.twitter) profile.social = [`https://www.twitter.com/${actor.twitter}`];
  if (actor.image) profile.avatar = `https://i.bang.com/pornstars/${actor.identifier}.jpg`;

  if (include?.releases) {
    profile.releases = await fetchActorReleases(actor, entity);
  }

  return profile;
}
/**
 * Fetches one page of a channel's releases from the video search index, newest first.
 *
 * Matches documents with status 'ok' and a release date up to now whose
 * nested `series.id` equals `site.parameters.siteId`, excluding type 'trailer'.
 *
 * @param {object} site - channel; `parameters.siteId` selects the series
 * @param {number} [page=1] - 1-based page number, 50 releases per page
 * @returns {Promise<object[]|number>} scraped releases, or the response status
 *   when the request failed
 */
async function fetchLatest(site, page = 1) {
  const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
    size: 50,
    from: (page - 1) * 50,
    query: {
      bool: {
        must: [
          {
            match: {
              status: 'ok',
            },
          },
          {
            range: {
              releaseDate: {
                lte: 'now',
              },
            },
          },
          /*
           * global fetch
          {
            nested: {
              path: 'studio',
              query: {
                bool: {
                  must: [
                    {
                      match: {
                        'studio.name': {
                          operator: 'AND',
                          query: 'bang! originals',
                        },
                      },
                    },
                  ],
                },
              },
            },
          },
          */
          {
            nested: {
              path: 'series',
              query: {
                bool: {
                  must: [
                    {
                      match: {
                        'series.id': {
                          operator: 'AND',
                          query: site.parameters.siteId,
                        },
                      },
                    },
                  ],
                },
              },
            },
          },
        ],
        must_not: [
          {
            match: {
              type: 'trailer',
            },
          },
        ],
      },
    },
    sort: [
      {
        releaseDate: {
          order: 'desc',
        },
      },
    ],
  }, {
    encodeJSON: true,
    headers: {
      Authorization: `Basic ${authKey}`,
    },
  });

  // report a failed request by its status, as fetchProfile does, instead of throwing on a missing body
  if (!res.ok) {
    return res.status;
  }

  return scrapeAll(res.body.hits.hits, site);
}
/**
 * Fetches one page of a channel's releases dated up to seven days ahead, newest first.
 *
 * Matches documents with status 'ok' and a release date up to `now+7d` whose
 * nested `series.id` equals `site.parameters.siteId`, excluding type 'trailer'.
 *
 * NOTE(review): the range has only an upper bound, so releases that are
 * already out match as well — confirm whether a lower bound was intended.
 *
 * @param {object} site - channel; `parameters.siteId` selects the series
 * @param {number} [page=1] - 1-based page number, 50 releases per page
 * @returns {Promise<object[]|number>} scraped releases, or the response status
 *   when the request failed
 */
async function fetchUpcoming(site, page = 1) {
  const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
    size: 50,
    from: (page - 1) * 50,
    query: {
      bool: {
        must: [
          {
            match: {
              status: 'ok',
            },
          },
          {
            range: {
              releaseDate: {
                lte: 'now+7d',
              },
            },
          },
          {
            nested: {
              path: 'series',
              query: {
                bool: {
                  must: [
                    {
                      match: {
                        'series.id': {
                          operator: 'AND',
                          query: site.parameters.siteId,
                        },
                      },
                    },
                  ],
                },
              },
            },
          },
        ],
        must_not: [
          {
            match: {
              type: 'trailer',
            },
          },
        ],
      },
    },
    sort: [
      {
        releaseDate: {
          order: 'desc',
        },
      },
    ],
  }, {
    encodeJSON: true,
    headers: {
      Authorization: `Basic ${authKey}`,
    },
  });

  // report a failed request by its status, as fetchProfile does, instead of throwing on a missing body
  if (!res.ok) {
    return res.status;
  }

  return scrapeAll(res.body.hits.hits, site);
}
/**
 * Fetches a single scene by the encoded ID in its bang.com URL.
 *
 * @param {string} url - scene URL; the second path segment is the encoded ID
 * @param {object} entity - passed through to scrapeScene
 * @param {object} [baseRelease] - release already scraped from an overview, if any
 * @param {object} [options] - passed through to scrapeScene
 * @returns {Promise<object|number>} the release, or the response status when
 *   the document could not be fetched
 */
async function fetchScene(url, entity, baseRelease, options) {
  if (baseRelease?.entryId) {
    // overview and deep data is the same, don't hit server unnecessarily
    return baseRelease;
  }

  const encodedId = new URL(url).pathname.split('/')[2];
  const entryId = decodeId(encodedId);

  const res = await http.get(`https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`, {
    headers: {
      Authorization: `Basic ${authKey}`,
    },
  });

  // a failed lookup has no document to scrape; report its status like fetchProfile does
  if (!res.ok) {
    return res.status;
  }

  return scrapeScene(res.body._source, entity, options); // eslint-disable-line no-underscore-dangle
}
/**
 * Looks up an actor by name in the actor search index and scrapes the profile.
 *
 * The five best-scoring hits with status 'ok' are requested; the first one
 * whose name equals the requested name, ignoring case, is used.
 *
 * @param {object} actor - only `name` is read
 * @param {object} context - `entity` is passed on to scrapeProfile
 * @param {object} include - passed on to scrapeProfile
 * @returns {Promise<object|number|null>} the profile, null when no hit has the
 *   same name, or the response status when the request failed
 */
async function fetchProfile({ name: actorName }, context, include) {
  const searchUrl = `https://${clusterId}.us-east-1.aws.found.io/actors/actor/_search`;

  const nameMatch = {
    match: {
      name: {
        query: actorName,
        operator: 'and',
      },
    },
  };

  const statusMatch = {
    match: {
      status: 'ok',
    },
  };

  const searchBody = {
    size: 5,
    sort: [{
      _score: {
        order: 'desc',
      },
    }],
    query: {
      bool: {
        must: [nameMatch, statusMatch],
      },
    },
  };

  const requestOptions = {
    headers: {
      Authorization: `Basic ${authKey}`,
    },
    encodeJSON: true,
  };

  const res = await http.post(searchUrl, searchBody, requestOptions);

  if (!res.ok) {
    return res.status;
  }

  /* eslint-disable no-underscore-dangle */
  const hasSameName = (hit) => hit._source.name.toLowerCase() === actorName.toLowerCase();
  const matchingHit = res.body.hits.hits.find(hasSameName);

  return matchingHit
    ? scrapeProfile(matchingHit._source, context.entity, include)
    : null;
  /* eslint-enable no-underscore-dangle */
}
// Public scraper interface: only the fetch* entry points are exported; the scrape* helpers stay module-private.
module.exports = {
fetchLatest,
fetchProfile,
fetchScene,
fetchUpcoming,
};

View File

@@ -1,29 +1,15 @@
'use strict';
const http = require('../utils/http');
const qu = require('../utils/qu');
const { extractDate } = require('../utils/qu');
const { inchesToCm } = require('../utils/convert');
const unprint = require('unprint');
const slugify = require('../utils/slugify');
const capitalize = require('../utils/capitalize');
const clusterId = '617fb597b659459bafe6472470d9073a';
const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
const genderMap = {
M: 'male',
F: 'female',
};
function getScreenUrl(item, scene) {
if (!scene.dvd?.id || !item?.screenId) {
return null;
/*
function encodeId(id) {
if (!id) {
return id;
}
return `https://i.bang.com/screenshots/${scene.dvd.id}/${scene.type}/${scene.order}/${item.screenId}.jpg`;
}
function encodeId(id) {
return Buffer
.from(id, 'hex')
.toString('base64')
@@ -31,8 +17,13 @@ function encodeId(id) {
.replace(/\//g, '_')
.replace(/=/g, ',');
}
*/
function decodeId(id) {
if (!id) {
return id;
}
const restoredId = id
.replace(/-/g, '+')
.replace(/_/g, '/')
@@ -43,387 +34,150 @@ function decodeId(id) {
.toString('hex');
}
async function fetchPhotos(scene) {
const photoPaths = Array.from({ length: scene.photos }, (value, index) => `/${scene.dvd.id}/${scene.identifier}/final/${String(index + 1).padStart(6, '0')}.jpg`);
function scrapeAll(scenes, entity) {
return scenes.map(({ query }) => {
const release = {};
const res = await http.post('https://www.bang.com/sign-images', {
images: photoPaths,
}, {
encodeJSON: false,
release.url = query.url('.video_preview_container > a', { origin: entity.url });
release.entryId = query.attribute(null, 'data-video-id') || decodeId(new URL(release.url).pathname.match(/\/video\/([\w-]+)\//)?.[1]);
release.title = query.content('.video_preview_container >a > span.block');
release.date = query.date('.videoInfo .statistics span', 'MMM DD, YYYY');
release.actors = query.elements('.videoInfo a[href*="/pornstar"]').map((el) => ({
name: unprint.query.content(el),
url: unprint.query.url(el, null, { origin: 'https://www.bang.com' }),
}));
const poster = query.img('img[data-videopreview-target="image"]');
const posterUrl = new URL(poster);
if (poster) {
release.poster = [
`${posterUrl.origin}${posterUrl.pathname}`,
posterUrl.href,
];
}
release.teaser = query.video();
return release;
});
if (res.ok && res.body.images) {
return res.body.images.map((image) => qu.prefixUrl(image, 'https://photos.bang.com'));
}
return null;
}
async function scrapeScene(scene, entity, options) {
const release = {
entryId: scene.id,
title: scene.name || (scene.dvd?.name && scene.type === 'bonus' && capitalize(`${scene.dvd.name} - Bonus Scene ${scene.order || 1}`)) || null,
description: scene.description,
tags: scene.genres.concat(scene.actions).map((genre) => genre.name),
duration: scene.duration,
};
async function scrapeScene({ query }, { url, entity }) {
const release = {};
const data = query.json('script[type="application/ld+json"]');
const slug = slugify(release.title);
release.url = `https://www.bang.com/video/${encodeId(release.entryId)}/${slug}`;
release.entryId = data?.['@id'] || decodeId(new URL(url).pathname.match(/\/video\/([\w-]+)\//)?.[1]);
const date = new Date(scene.releaseDate);
release.date = new Date(Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate()));
release.title = data?.name || query.content('.video-heading');
release.description = data?.description || query.content('.expanded p.clear-both');
release.actors = scene.actors.map((actor) => ({ name: actor.name, gender: genderMap[actor.gender] }));
release.date = unprint.extractDate(data?.datePublished, 'YYYY-MM-DD');
release.duration = unprint.extractTimestamp(data?.duration) || query.duration('//p[contains(text(), "Playtime:")]//span');
if (scene.is4k) release.tags.push('4k');
if (scene.gay) release.tags.push('gay');
release.actors = data?.actor.map((actor) => ({
name: actor.name,
url: actor.url,
})) || query.contents('.expanded a[href*="/pornstar"]');
const defaultPoster = scene.screenshots.find((photo) => photo.default === true);
const screens = scene.screenshots.filter((photo) => photo.default === false);
release.tags = query.contents('.expanded .genres');
const remainingScreens = defaultPoster ? screens : screens.slice(1);
const poster = defaultPoster || screens[0];
release.poster = data?.thumbnailUrl || data?.contentUrl || query.attribute('meta[name*="og:image"]', 'content');
release.teaser = query.video('video[data-modal-target="videoImage"] source');
release.poster = getScreenUrl(poster, scene);
release.photos = remainingScreens.map((photo) => getScreenUrl(photo, scene));
release.photos = JSON.parse(query.attribute('[data-video-gallery-photos-value]', 'data-video-gallery-photos-value'));
release.photoCount = query.number('[data-video-gallery-count-value]', { attribute: 'data-video-gallery-count-value' });
if (options?.includePhotos) {
const photos = await fetchPhotos(scene);
const channelName = query.content('.expanded a[href*="?in="]')?.trim();
if (photos?.length > 0) {
release.photos = photos;
}
if (channelName) {
release.channel = entity.children?.find((channel) => new RegExp(channel.name, 'i').test(channelName) || slugify(channelName) === channel.slug)?.slug;
}
release.teaser = `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`;
release.channel = scene.series.name
.replace(/[! .]/g, '')
.replace('&', 'and');
return release;
}
function scrapeAll(scenes, entity) {
return Promise.all(scenes.map(({ _source: scene }) => scrapeScene(scene, entity)));
}
async function fetchActorScenes(element, url, entity, page = 1, acc = []) {
const scenes = scrapeAll(unprint.initAll(element, '.search-grid li'), entity);
async function fetchActorReleases(actor, entity) {
const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
size: 50,
query: {
bool: {
must: [
{
match: {
status: 'ok',
},
},
{
nested: {
path: 'actors',
query: {
bool: {
must: [
{
match: {
'actors.mongoId': {
operator: 'AND',
query: actor.id,
},
},
},
],
},
},
},
},
],
must_not: [
{
match: {
type: 'trailer',
},
},
],
},
},
sort: [
{
releaseDate: {
order: 'desc',
},
},
],
}, {
encodeJSON: true,
headers: {
Authorization: `Basic ${authKey}`,
},
});
if (scenes.length) {
const nextPageRes = await unprint.post(url, { page: page + 1 });
return scrapeAll(res.body.hits.hits, entity);
}
async function scrapeProfile(actor, entity, include) {
const profile = {};
profile.aliases = actor.aliases;
profile.dateOfBirth = extractDate(actor.birthDate);
profile.gender = ({ F: 'female', M: 'male' })[actor.gender];
profile.ethnicity = actor.ethnicity;
profile.nationality = actor.nationality;
profile.birthPlace = `${actor.birthCity}, ${actor.birthCountry || ''}`;
profile.hair = actor.hairColor;
profile.eyes = actor.eyeColor;
profile.naturalBoobs = actor.naturalBreasts;
if (actor.measurements) {
const { cupSize, shoulder, chest, waist, height } = actor.measurements;
if (height) profile.height = inchesToCm(height);
if (cupSize) profile.cup = cupSize;
// [SIC]
if (shoulder) profile.bust = shoulder;
if (chest) profile.waist = chest;
if (waist) profile.hip = waist;
if (nextPageRes.ok) {
return fetchActorScenes(nextPageRes.context.element, url, entity, page + 1, acc.concat(scenes));
}
}
if (actor.twitter) profile.social = [`https://www.twitter.com/${actor.twitter}`];
if (actor.image) profile.avatar = `https://i.bang.com/pornstars/${actor.identifier}.jpg`;
return acc.concat(scenes);
}
if (include.releases) {
profile.releases = await fetchActorReleases(actor, entity);
/**
 * Scrapes an actor profile from a parsed bang.com actor page.
 *
 * Each text field is read from the first bold span that follows a text node
 * containing the field's label.
 *
 * @param {object} context - unprint context; `query` runs the selectors and
 *   `element` is handed to fetchActorScenes
 * @param {string} url - actor page URL, passed on to fetchActorScenes
 * @param {object} entity - passed on to fetchActorScenes
 * @param {object} include - `scenes` additionally collects the actor's scenes
 * @returns {Promise<object>} the profile
 */
async function scrapeProfile({ query, element }, url, entity, include) {
  const boldValueAfter = (label) => `//text()[contains(., "${label}")]/following-sibling::span[contains(@class, "font-bold")][1]`;

  const profile = {
    dateOfBirth: query.date(boldValueAfter('Born'), 'MMMM D, YYYY'),
    birthPlace: query.content(boldValueAfter('in')),
    ethnicity: query.content(boldValueAfter('Ethnicity')),
    hairColor: query.content(boldValueAfter('Hair Color')),
    eyes: query.content(boldValueAfter('Eye Color')),
  };

  const avatar = query.img('img[alt*="profile"][src*="https://i.bang.com/pornstars/"]');

  if (avatar) {
    const avatarUrl = new URL(avatar);

    profile.avatar = [
      `${avatarUrl.origin}${avatarUrl.pathname}`, // full size
      avatar,
    ];
  }

  if (include.scenes) {
    profile.scenes = await fetchActorScenes(element, url, entity);
  }

  return profile;
}
async function fetchLatest(site, page = 1) {
const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
size: 50,
from: (page - 1) * 50,
query: {
bool: {
must: [
{
match: {
status: 'ok',
},
},
{
range: {
releaseDate: {
lte: 'now',
},
},
},
/*
* global fetch
{
nested: {
path: 'studio',
query: {
bool: {
must: [
{
match: {
'studio.name': {
operator: 'AND',
query: 'bang! originals',
},
},
},
],
},
},
},
},
*/
{
nested: {
path: 'series',
query: {
bool: {
must: [
{
match: {
'series.id': {
operator: 'AND',
query: site.parameters.siteId,
},
},
},
],
},
},
},
},
],
must_not: [
{
match: {
type: 'trailer',
},
},
],
},
},
sort: [
{
releaseDate: {
order: 'desc',
},
},
],
}, {
encodeJSON: true,
headers: {
Authorization: `Basic ${authKey}`,
},
});
return scrapeAll(res.body.hits.hits, site);
}
async function fetchUpcoming(site, page = 1) {
const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
size: 50,
from: (page - 1) * 50,
query: {
bool: {
must: [
{
match: {
status: 'ok',
},
},
{
range: {
releaseDate: {
lte: 'now+7d',
},
},
},
{
nested: {
path: 'series',
query: {
bool: {
must: [
{
match: {
'series.id': {
operator: 'AND',
query: site.parameters.siteId,
},
},
},
],
},
},
},
},
],
must_not: [
{
match: {
type: 'trailer',
},
},
],
},
},
sort: [
{
releaseDate: {
order: 'desc',
},
},
],
}, {
encodeJSON: true,
headers: {
Authorization: `Basic ${authKey}`,
},
});
return scrapeAll(res.body.hits.hits, site);
}
async function fetchScene(url, entity, baseRelease, options) {
if (baseRelease?.entryId) {
// overview and deep data is the same, don't hit server unnecessarily
return baseRelease;
}
const encodedId = new URL(url).pathname.split('/')[2];
const entryId = decodeId(encodedId);
const res = await http.get(`https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`, {
headers: {
Authorization: `Basic ${authKey}`,
},
});
return scrapeScene(res.body._source, entity, options); // eslint-disable-line no-underscore-dangle
}
async function fetchProfile({ name: actorName }, context, include) {
const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/actors/actor/_search`, {
size: 5,
sort: [{
_score: {
order: 'desc',
},
}],
query: {
bool: {
must: [
{
match: {
name: {
query: actorName,
operator: 'and',
},
},
},
{
match: {
status: 'ok',
},
},
],
},
},
}, {
headers: {
Authorization: `Basic ${authKey}`,
},
encodeJSON: true,
});
async function fetchLatest(channel, page = 1) {
const url = `${channel.url}&page=${page}`;
const res = await unprint.get(url, { selectAll: '.search-grid li' });
if (res.ok) {
const actor = res.body.hits.hits.find((hit) => hit._source.name.toLowerCase() === actorName.toLowerCase());
if (actor) {
return scrapeProfile(actor._source, context.entity, include);
}
return null;
return scrapeAll(res.context, channel);
}
return res.status;
}
/**
 * Finds an actor through the bang.com pornstar search and scrapes their profile page.
 *
 * NOTE(review): the actor name is interpolated into the XPath unescaped, so a
 * name containing a double quote would break the expression.
 *
 * @param {object} actor - only `name` is read
 * @param {object} context - only `entity` is read; it is passed on to scrapeProfile
 * @param {object} include - passed on to scrapeProfile
 * @returns {Promise<object|number|null>} the profile, null when the search
 *   results contain no link for the name, or the status of the failed request
 */
async function fetchProfile({ name: actorName }, { entity }, include) {
  const searchUrl = `https://www.bang.com/pornstars?term=${slugify(actorName, '+')}`;
  const searchRes = await unprint.get(searchUrl);

  if (!searchRes.ok) {
    return searchRes.status;
  }

  // the profile link is the anchor whose span text contains the actor's name
  const profileLinkPath = `//a[contains(.//span, "${actorName}")]`;
  const url = searchRes.context.query.url(profileLinkPath);

  if (!url) {
    return null;
  }

  const actorRes = await unprint.get(url);

  return actorRes.ok
    ? scrapeProfile(actorRes.context, url, entity, include)
    : actorRes.status;
}
module.exports = {
fetchLatest,
fetchProfile,
fetchScene,
fetchUpcoming,
scrapeScene,
useUnprint: true,
};

View File

@@ -237,6 +237,10 @@ async function destroyBypassSession(sessionId) {
}
async function destroyBypassSessions() {
if (!config.bypass.cloudflare.enabled) {
return;
}
const sessionListRes = await limiters.bypass.schedule(async () => bhttp.post(config.bypass.cloudflare.path, {
cmd: 'sessions.list',
}, {