Added support for release photo fallbacks. Limited photo fetching concurrency. Modified XEmpire scraper for AllBlackX support and improved photo scraping. Added movie property to Evil Angel scraper.
commit a310f9bb1d (parent c26d5b8655)

Binary file not shown (image added, 14 KiB).
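The photo fallback support means a scraper can now return a single photo as an array of candidate URLs, best quality first. The fetchPhoto change in src/media.js below currently just takes the first candidate and leaves a commented-out hint at trying each one in turn; a minimal sketch of that fallback chain, using a hypothetical fetchSingle helper that downloads one URL and throws on failure:

// Sketch only: resolve with the first candidate URL that downloads successfully.
// `fetchSingle` is a hypothetical single-URL fetcher, not part of this commit.
async function fetchWithFallbacks(photoUrl, index, identifier) {
  const candidates = Array.isArray(photoUrl) ? photoUrl : [photoUrl];

  return candidates.reduce(
    (outcome, url) => outcome.catch(() => fetchSingle(url, index, identifier)),
    Promise.reject(new Error('no candidate tried yet')), // seed rejection; the first candidate catches it
  );
}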
@@ -582,7 +582,7 @@ function getSites(networksMap) {
     },
     {
       slug: 'pornstarslikeitbig',
-      name: 'Pornstars Like it Big',
+      name: 'Pornstars Like It Big',
       url: 'https://www.brazzers.com/sites/view/id/24/pornstars-like-it-big',
       description: "A real big dick, that's what everyone wants. Porn-stars are no exception, all the biggest stars agree; BIG COCK is for them. Check out how it stretches their tiny pussies and cums on their round tits. We've got the best chicks jocking the biggest dicks.",
       network_id: networksMap.brazzers,

@@ -2397,6 +2397,13 @@ function getSites(networksMap) {
       url: 'https://www.darkx.com',
       network_id: networksMap.xempire,
     },
+    {
+      slug: 'allblackx',
+      name: 'AllBlackX',
+      description: 'AllBlackX.com features the hottest ebony pornstar beauties in hardcore black on black gonzo porn. From director Mason, watch 4k ultra HD videos inside',
+      url: 'https://www.allblackx.com',
+      network_id: networksMap.xempire,
+    },
     {
       slug: 'lesbianx',
       name: 'LesbianX',

@@ -1166,6 +1166,10 @@ function getTagAliases(tagsMap) {
       name: 'dp',
       alias_for: tagsMap['double-penetration'],
     },
+    {
+      name: 'first dp',
+      alias_for: tagsMap['double-penetration'],
+    },
     {
       name: 'double penetration (dp)',
       alias_for: tagsMap['double-penetration'],
src/media.js (30 changed lines)

@@ -10,7 +10,6 @@ const sharp = require('sharp');
 const blake2 = require('blake2');
 
 const knex = require('./knex');
-const pluckPhotos = require('./utils/pluck-photos');
 
 function getHash(buffer) {
   const hash = blake2.createHash('blake2b', { digestLength: 24 });

@@ -20,6 +19,21 @@ function getHash(buffer) {
   return hash.digest('hex');
 }
 
+function pluckPhotos(photos, release, specifiedLimit) {
+  const limit = specifiedLimit || config.media.limit;
+
+  if (photos.length <= limit) {
+    return photos;
+  }
+
+  const plucked = [1]
+    .concat(
+      Array.from({ length: limit }, (value, index) => Math.round((index + 1) * (photos.length / (limit)))),
+    );
+
+  return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close
+}
+
 async function getThumbnail(buffer) {
   return sharp(buffer)
     .resize({

@@ -94,7 +108,12 @@ async function filterHashDuplicates(files, domains = ['releases'], roles = ['pho
   return files.filter(file => file && !photoHashes.has(file.hash));
 }
 
-async function fetchPhoto(photoUrl, index, identifier) {
+async function fetchPhoto(photoUrl, index, identifier, attempt = 1) {
+  if (Array.isArray(photoUrl)) {
+    return fetchPhoto(photoUrl[0], index, identifier);
+    // return photoUrl.reduce(async (outcome, url) => outcome.catch(async () => fetchPhoto(url, index, identifier)), Promise.reject());
+  }
+
   try {
     const { pathname } = new URL(photoUrl);
     const mimetype = mime.getType(pathname);

@@ -116,7 +135,12 @@ async function fetchPhoto(photoUrl, index, identifier) {
 
     throw new Error(`Response ${res.statusCode} not OK`);
   } catch (error) {
-    console.warn(`Failed to store photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`);
+    console.warn(`Failed attempt ${attempt}/3 to fetch photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`);
+
+    if (attempt < 3) {
+      await Promise.delay(1000);
+      return fetchPhoto(photoUrl, index, identifier, attempt + 1);
+    }
 
     return null;
   }
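The pluckPhotos helper, previously required from ./utils/pluck-photos, is now inlined above. It always keeps the first photo and spreads the remaining picks evenly across the album, deduplicating indices that collide when the album size is close to the limit. A standalone illustration of the index math (the photo names and limit below are made up for the example):

// 30 photos with a limit of 10 yields the 1-based indices 1, 3, 6, ..., 30.
const photos = Array.from({ length: 30 }, (value, index) => `photo-${index + 1}.jpg`);
const limit = 10;

const plucked = [1].concat(
  Array.from({ length: limit }, (value, index) => Math.round((index + 1) * (photos.length / limit))),
);

console.log(Array.from(new Set(plucked))); // [1, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30]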
@@ -39,6 +39,7 @@ async function findNetworkByUrl(url) {
 
   const network = await knex('networks')
     .where('networks.url', 'like', `%${domain}`)
+    .orWhere('networks.url', url)
     .first();
 
   if (network) {
@@ -248,7 +248,6 @@ async function storeReleaseAssets(release, releaseId) {
 
   try {
     await Promise.all([
-      associateTags(release, releaseId),
       storePhotos(release, releaseId),
       storePoster(release, releaseId),
       storeTrailer(release, releaseId),

@@ -275,17 +274,22 @@ async function storeRelease(release) {
      })
      .returning('*');
 
-    // await storeReleaseAssets(release, existingRelease.id);
-    console.log(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);
-
-    return updatedRelease ? updatedRelease.id : existingRelease.id;
+    if (updatedRelease) {
+      await associateTags(release, updatedRelease.id);
+      console.log(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);
+    }
+
+    await associateTags(release, existingRelease.id);
+
+    return existingRelease.id;
   }
 
   const [releaseEntry] = await knex('releases')
     .insert(curatedRelease)
     .returning('*');
 
-  // await storeReleaseAssets(release, releaseEntry.id);
+  await associateTags(release, releaseEntry.id);
 
   console.log(`Stored release "${release.title}" (${releaseEntry.id}, ${release.site.name})`);
 
   return releaseEntry.id;

@@ -334,7 +338,9 @@ async function storeReleases(releases) {
 
   await Promise.all([
     associateActors(actors, storedReleases),
-    Promise.all(storedReleases.map(async release => storeReleaseAssets(release, release.id))),
+    Promise.map(storedReleases, async release => storeReleaseAssets(release, release.id), {
+      concurrency: 10,
+    }),
   ]);
 
   return storedReleases;
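The asset-storage loop above moves from Promise.all over a map, which fires every storeReleaseAssets call at once, to Promise.map with a concurrency option so that at most 10 releases fetch their photos at the same time. Promise.map and the Promise.delay used in media.js are Bluebird methods; assuming Bluebird is a dependency but not installed as the global Promise, the equivalent can be written explicitly:

// Sketch: cap scraper-side concurrency with Bluebird's Promise.map.
// `storeReleaseAssets` is the project's own function; requiring bluebird locally is an assumption.
const Bluebird = require('bluebird');

function storeAllAssets(storedReleases) {
  return Bluebird.map(
    storedReleases,
    async release => storeReleaseAssets(release, release.id),
    { concurrency: 10 }, // at most 10 releases processed simultaneously
  );
}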
@@ -7,7 +7,6 @@ const { JSDOM } = require('jsdom');
 const moment = require('moment');
 
 const { heightToCm } = require('../utils/convert');
-const { matchTags } = require('../tags');
 
 async function fetchPhotos(url) {
   const res = await bhttp.get(url);

@@ -22,13 +21,8 @@ function scrapePhotos(html) {
     .map((photoIndex, photoElement) => {
       const src = $(photoElement).attr('src');
 
-      if (src.match(/dl\d+/)) {
-        // thumbnail URLs containing dl02/ or dl03/ don't appear to have
-        // a full photo available, fall back to thumbnail
-        return src;
-      }
-
-      return src.replace('thumbs/', 'photos/');
+      // high res often available in photos/ directory, but not always, provide original as fallback
+      return [src.replace('thumbs/', 'photos/'), src];
     })
     .toArray();
 

@@ -172,8 +166,8 @@ async function scrapeScene(html, url, site) {
 
   const photos = await getPhotos(entryId, site);
 
-  const rawTags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
-  const tags = await matchTags(rawTags);
+  const tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
+  const movie = $('.update_dvds a').href;
 
   return {
     url,

@@ -184,6 +178,7 @@ async function scrapeScene(html, url, site) {
     description,
     poster,
     photos,
+    movie,
     trailer: {
       src: trailer,
       quality: 720,
@@ -6,14 +6,12 @@ const cheerio = require('cheerio');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');
 
-const { fetchSites } = require('../sites');
-const { matchTags } = require('../tags');
-
 const defaultTags = {
   hardx: [],
   darkx: ['interracial'],
   eroticax: [],
   lesbianx: ['lesbian'],
+  allblackx: ['ebony', 'bbc'],
 };
 
 async function fetchPhotos(url) {

@@ -25,37 +23,56 @@ async function fetchPhotos(url) {
 function scrapePhotos(html) {
   const $ = cheerio.load(html, { normalizeWhitespace: true });
 
-  const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
-    .map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
-
-  const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
-    .map((photoIndex, photoElement) => $(photoElement)
-      .attr('src'))
-      // .replace('_tb.jpg', '.jpg')) does not always work
-    .toArray();
-
-  return unlockedPhotos.concat(lockedThumbnails);
+  return $('.preview .imgLink').toArray().map((linkEl) => {
+    const url = $(linkEl).attr('href');
+
+    if (url.match('/join')) {
+      // URL links to join page instead of full photo, extract thumbnail
+      const src = $(linkEl).find('img').attr('src');
+
+      if (src.match('previews/')) {
+        // resource often serves full photo at a modifier URL anyway, add as primary source
+        const highRes = src
+          .replace('previews/', '')
+          .replace('_tb.jpg', '.jpg');
+
+        // keep original thumbnail as fallback in case full photo is not available
+        return [highRes, src];
+      }
+
+      return src;
+    }
+
+    // URL links to full photo
+    return url;
+  });
 }
 
 async function getPhotos(albumPath, siteDomain) {
   const albumUrl = `https://${siteDomain}${albumPath}`;
 
-  const html = await fetchPhotos(albumUrl);
-  const $ = cheerio.load(html, { normalizeWhitespace: true });
-  const photos = scrapePhotos(html);
+  try {
+    const html = await fetchPhotos(albumUrl);
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+    const photos = scrapePhotos(html);
 
     const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
 
     const otherPhotos = await Promise.map(pages, async (page) => {
       const pageUrl = `https://${siteDomain}${page}`;
       const pageHtml = await fetchPhotos(pageUrl);
 
       return scrapePhotos(pageHtml);
     }, {
       concurrency: 2,
     });
 
-  return photos.concat(otherPhotos.flat());
+    return photos.concat(otherPhotos.flat());
+  } catch (error) {
+    console.error(`Failed to fetch XEmpire photos from ${albumPath}: ${error.message}`);
+
+    return [];
+  }
 }
 
 function scrape(html, site) {

@@ -109,32 +126,26 @@ function scrape(html, site) {
 async function scrapeScene(html, url, site) {
   const $ = cheerio.load(html, { normalizeWhitespace: true });
   const json = $('script[type="application/ld+json"]').html();
+  const json2 = $('script:contains("dataLayer = ")').html();
   const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
 
   const data = JSON.parse(json)[0];
+  const data2 = JSON.parse(json2.slice(json2.indexOf('[{'), -1))[0];
   const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
 
-  const entryId = new URL(url).pathname.split('/').slice(-1)[0];
+  const entryId = data2.sceneDetails.sceneId || new URL(url).pathname.split('/').slice(-1)[0];
 
-  const title = $('meta[name="twitter:title"]').attr('content');
-  const description = data.description || $('meta[name="twitter:description"]').attr('content');
+  const title = data2.sceneDetails.sceneTitle || $('meta[name="twitter:title"]').attr('content');
+  const description = data2.sceneDetails.sceneDescription || data.description || $('meta[name="twitter:description"]').attr('content');
   // date in data object is not the release date of the scene, but the date the entry was added
   const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate();
 
-  const actors = data.actor
-    .sort(({ gender: genderA }, { gender: genderB }) => {
-      if (genderA === 'female' && genderB === 'male') return -1;
-      if (genderA === 'male' && genderB === 'female') return 1;
-
-      return 0;
-    })
-    .map(actor => actor.name);
+  const actors = (data2.sceneDetails.sceneActors || data.actor).map(actor => actor.actorName || actor.name);
 
   const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5;
 
   const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
 
-  const siteDomain = $('meta[name="twitter:domain"]').attr('content');
+  const siteDomain = $('meta[name="twitter:domain"]').attr('content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
   const siteSlug = siteDomain && siteDomain.split('.')[0].toLowerCase();
   const siteUrl = siteDomain && `https://www.${siteDomain}`;

@@ -144,19 +155,10 @@ async function scrapeScene(html, url, site) {
   const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain, site);
 
   const rawTags = data.keywords.split(', ');
-
-  const [[channelSite], tags] = await Promise.all([
-    site.isFallback
-      ? fetchSites({
-        url: siteUrl,
-        slug: siteSlug,
-      })
-      : [site],
-    matchTags([...defaultTags[siteSlug], ...rawTags]),
-  ]);
+  const tags = [...defaultTags[siteSlug], ...rawTags];
 
   return {
-    url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url,
+    url: `${siteUrl}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`,
     entryId,
     title,
     date,

@@ -174,7 +176,8 @@ async function scrapeScene(html, url, site) {
     rating: {
       stars,
     },
-    site: channelSite || site,
+    site,
+    channel: siteSlug,
   };
 }
 
@@ -62,11 +62,14 @@ async function findSiteByUrl(url) {
       'sites.*',
       'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
     )
-    .where('sites.url', 'like', `%${domain}%`)
+    .where('sites.url', 'like', `%${domain}`)
+    .orWhere('sites.url', url)
     .first();
 
   if (site) {
-    return curateSite(site, true);
+    const curatedSite = curateSite(site, true);
+
+    return curatedSite;
   }
 
   return null;