Added support for release photo fallbacks. Limited photo fetching concurrency. Modified XEmpire scraper for AllBlackX support and improved photo scraping. Added movie property to Evil Angel scraper.

This commit is contained in:
ThePendulum 2019-12-12 03:12:05 +01:00
parent c26d5b8655
commit a310f9bb1d
9 changed files with 113 additions and 70 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

View File

@ -582,7 +582,7 @@ function getSites(networksMap) {
},
{
slug: 'pornstarslikeitbig',
name: 'Pornstars Like it Big',
name: 'Pornstars Like It Big',
url: 'https://www.brazzers.com/sites/view/id/24/pornstars-like-it-big',
description: "A real big dick, that's what everyone wants. Porn-stars are no exception, all the biggest stars agree; BIG COCK is for them. Check out how it stretches their tiny pussies and cums on their round tits. We've got the best chicks jocking the biggest dicks.",
network_id: networksMap.brazzers,
@ -2397,6 +2397,13 @@ function getSites(networksMap) {
url: 'https://www.darkx.com',
network_id: networksMap.xempire,
},
{
slug: 'allblackx',
name: 'AllBlackX',
description: 'AllBlackX.com features the hottest ebony pornstar beauties in hardcore black on black gonzo porn. From director Mason, watch 4k ultra HD videos inside',
url: 'https://www.allblackx.com',
network_id: networksMap.xempire,
},
{
slug: 'lesbianx',
name: 'LesbianX',

View File

@ -1166,6 +1166,10 @@ function getTagAliases(tagsMap) {
name: 'dp',
alias_for: tagsMap['double-penetration'],
},
{
name: 'first dp',
alias_for: tagsMap['double-penetration'],
},
{
name: 'double penetration (dp)',
alias_for: tagsMap['double-penetration'],

View File

@ -10,7 +10,6 @@ const sharp = require('sharp');
const blake2 = require('blake2');
const knex = require('./knex');
const pluckPhotos = require('./utils/pluck-photos');
function getHash(buffer) {
const hash = blake2.createHash('blake2b', { digestLength: 24 });
@ -20,6 +19,21 @@ function getHash(buffer) {
return hash.digest('hex');
}
/**
 * Pick an evenly spread sample of photos from a photo set, never exceeding the limit.
 *
 * The first photo is always included; the remaining picks are spaced evenly so
 * that the final pick lands on the last photo of the set.
 *
 * @param {Array} photos - Full list of photos, in their original order.
 * @param {*} release - Unused by this function; kept so existing callers keep working.
 * @param {number} [specifiedLimit] - Maximum number of photos to return; when falsy,
 *   `config.media.limit` is used instead.
 * @returns {Array} `photos` itself when it already fits the limit, otherwise a new
 *   array of at most `limit` photos.
 */
function pluckPhotos(photos, release, specifiedLimit) {
  const limit = specifiedLimit || config.media.limit;

  if (photos.length <= limit) {
    return photos;
  }

  // Photo 1 takes one slot, so only limit - 1 further picks are spread over the set.
  // Spreading `limit` picks on top of the first photo would return limit + 1 photos.
  // For a limit of 1 the step is Infinity, but no further picks are generated then.
  const step = photos.length / (limit - 1);

  const plucked = [1].concat(
    Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * step)),
  );

  // remove duplicates, may happen when photo total and photo limit are close
  return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]);
}
async function getThumbnail(buffer) {
return sharp(buffer)
.resize({
@ -94,7 +108,12 @@ async function filterHashDuplicates(files, domains = ['releases'], roles = ['pho
return files.filter(file => file && !photoHashes.has(file.hash));
}
async function fetchPhoto(photoUrl, index, identifier) {
async function fetchPhoto(photoUrl, index, identifier, attempt = 1) {
if (Array.isArray(photoUrl)) {
return fetchPhoto(photoUrl[0], index, identifier);
// return photoUrl.reduce(async (outcome, url) => outcome.catch(async () => fetchPhoto(url, index, identifier)), Promise.reject());
}
try {
const { pathname } = new URL(photoUrl);
const mimetype = mime.getType(pathname);
@ -116,7 +135,12 @@ async function fetchPhoto(photoUrl, index, identifier) {
throw new Error(`Response ${res.statusCode} not OK`);
} catch (error) {
console.warn(`Failed to store photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`);
console.warn(`Failed attempt ${attempt}/3 to fetch photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`);
if (attempt < 3) {
await Promise.delay(1000);
return fetchPhoto(photoUrl, index, identifier, attempt + 1);
}
return null;
}

View File

@ -39,6 +39,7 @@ async function findNetworkByUrl(url) {
const network = await knex('networks')
.where('networks.url', 'like', `%${domain}`)
.orWhere('networks.url', url)
.first();
if (network) {

View File

@ -248,7 +248,6 @@ async function storeReleaseAssets(release, releaseId) {
try {
await Promise.all([
associateTags(release, releaseId),
storePhotos(release, releaseId),
storePoster(release, releaseId),
storeTrailer(release, releaseId),
@ -275,17 +274,22 @@ async function storeRelease(release) {
})
.returning('*');
// await storeReleaseAssets(release, existingRelease.id);
console.log(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);
if (updatedRelease) {
await associateTags(release, updatedRelease.id);
console.log(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);
}
return updatedRelease ? updatedRelease.id : existingRelease.id;
await associateTags(release, existingRelease.id);
return existingRelease.id;
}
const [releaseEntry] = await knex('releases')
.insert(curatedRelease)
.returning('*');
// await storeReleaseAssets(release, releaseEntry.id);
await associateTags(release, releaseEntry.id);
console.log(`Stored release "${release.title}" (${releaseEntry.id}, ${release.site.name})`);
return releaseEntry.id;
@ -334,7 +338,9 @@ async function storeReleases(releases) {
await Promise.all([
associateActors(actors, storedReleases),
Promise.all(storedReleases.map(async release => storeReleaseAssets(release, release.id))),
Promise.map(storedReleases, async release => storeReleaseAssets(release, release.id), {
concurrency: 10,
}),
]);
return storedReleases;

View File

@ -7,7 +7,6 @@ const { JSDOM } = require('jsdom');
const moment = require('moment');
const { heightToCm } = require('../utils/convert');
const { matchTags } = require('../tags');
async function fetchPhotos(url) {
const res = await bhttp.get(url);
@ -22,13 +21,8 @@ function scrapePhotos(html) {
.map((photoIndex, photoElement) => {
const src = $(photoElement).attr('src');
if (src.match(/dl\d+/)) {
// thumbnail URLs containing dl02/ or dl03/ don't appear to have
// a full photo available, fall back to thumbnail
return src;
}
return src.replace('thumbs/', 'photos/');
// high res often available in photos/ directory, but not always, provide original as fallback
return [src.replace('thumbs/', 'photos/'), src];
})
.toArray();
@ -172,8 +166,8 @@ async function scrapeScene(html, url, site) {
const photos = await getPhotos(entryId, site);
const rawTags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const tags = await matchTags(rawTags);
const tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const movie = $('.update_dvds a').href;
return {
url,
@ -184,6 +178,7 @@ async function scrapeScene(html, url, site) {
description,
poster,
photos,
movie,
trailer: {
src: trailer,
quality: 720,

View File

@ -6,14 +6,12 @@ const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const { fetchSites } = require('../sites');
const { matchTags } = require('../tags');
const defaultTags = {
hardx: [],
darkx: ['interracial'],
eroticax: [],
lesbianx: ['lesbian'],
allblackx: ['ebony', 'bbc'],
};
async function fetchPhotos(url) {
@ -25,37 +23,56 @@ async function fetchPhotos(url) {
function scrapePhotos(html) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
.map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
return $('.preview .imgLink').toArray().map((linkEl) => {
const url = $(linkEl).attr('href');
const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
.map((photoIndex, photoElement) => $(photoElement)
.attr('src'))
// .replace('_tb.jpg', '.jpg')) does not always work
.toArray();
if (url.match('/join')) {
// URL links to join page instead of full photo, extract thumbnail
const src = $(linkEl).find('img').attr('src');
return unlockedPhotos.concat(lockedThumbnails);
if (src.match('previews/')) {
// resource often serves full photo at a modifier URL anyway, add as primary source
const highRes = src
.replace('previews/', '')
.replace('_tb.jpg', '.jpg');
// keep original thumbnail as fallback in case full photo is not available
return [highRes, src];
}
return src;
}
// URL links to full photo
return url;
});
}
async function getPhotos(albumPath, siteDomain) {
const albumUrl = `https://${siteDomain}${albumPath}`;
const html = await fetchPhotos(albumUrl);
const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = scrapePhotos(html);
try {
const html = await fetchPhotos(albumUrl);
const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = scrapePhotos(html);
const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
const otherPhotos = await Promise.map(pages, async (page) => {
const pageUrl = `https://${siteDomain}${page}`;
const pageHtml = await fetchPhotos(pageUrl);
const otherPhotos = await Promise.map(pages, async (page) => {
const pageUrl = `https://${siteDomain}${page}`;
const pageHtml = await fetchPhotos(pageUrl);
return scrapePhotos(pageHtml);
}, {
concurrency: 2,
});
return scrapePhotos(pageHtml);
}, {
concurrency: 2,
});
return photos.concat(otherPhotos.flat());
return photos.concat(otherPhotos.flat());
} catch (error) {
console.error(`Failed to fetch XEmpire photos from ${albumPath}: ${error.message}`);
return [];
}
}
function scrape(html, site) {
@ -109,32 +126,26 @@ function scrape(html, site) {
async function scrapeScene(html, url, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const json = $('script[type="application/ld+json"]').html();
const json2 = $('script:contains("dataLayer = ")').html();
const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
const data = JSON.parse(json)[0];
const data2 = JSON.parse(json2.slice(json2.indexOf('[{'), -1))[0];
const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
const entryId = new URL(url).pathname.split('/').slice(-1)[0];
const entryId = data2.sceneDetails.sceneId || new URL(url).pathname.split('/').slice(-1)[0];
const title = $('meta[name="twitter:title"]').attr('content');
const description = data.description || $('meta[name="twitter:description"]').attr('content');
const title = data2.sceneDetails.sceneTitle || $('meta[name="twitter:title"]').attr('content');
const description = data2.sceneDetails.sceneDescription || data.description || $('meta[name="twitter:description"]').attr('content');
// date in data object is not the release date of the scene, but the date the entry was added
const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate();
const actors = data.actor
.sort(({ gender: genderA }, { gender: genderB }) => {
if (genderA === 'female' && genderB === 'male') return -1;
if (genderA === 'male' && genderB === 'female') return 1;
return 0;
})
.map(actor => actor.name);
const actors = (data2.sceneDetails.sceneActors || data.actor).map(actor => actor.actorName || actor.name);
const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5;
const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
const siteDomain = $('meta[name="twitter:domain"]').attr('content');
const siteDomain = $('meta[name="twitter:domain"]').attr('content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
const siteSlug = siteDomain && siteDomain.split('.')[0].toLowerCase();
const siteUrl = siteDomain && `https://www.${siteDomain}`;
@ -144,19 +155,10 @@ async function scrapeScene(html, url, site) {
const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain, site);
const rawTags = data.keywords.split(', ');
const [[channelSite], tags] = await Promise.all([
site.isFallback
? fetchSites({
url: siteUrl,
slug: siteSlug,
})
: [site],
matchTags([...defaultTags[siteSlug], ...rawTags]),
]);
const tags = [...defaultTags[siteSlug], ...rawTags];
return {
url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url,
url: `${siteUrl}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`,
entryId,
title,
date,
@ -174,7 +176,8 @@ async function scrapeScene(html, url, site) {
rating: {
stars,
},
site: channelSite || site,
site,
channel: siteSlug,
};
}

View File

@ -62,11 +62,14 @@ async function findSiteByUrl(url) {
'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
)
.where('sites.url', 'like', `%${domain}%`)
.where('sites.url', 'like', `%${domain}`)
.orWhere('sites.url', url)
.first();
if (site) {
return curateSite(site, true);
const curatedSite = curateSite(site, true);
return curatedSite;
}
return null;