diff --git a/public/img/logos/xempire/allblackx.png b/public/img/logos/xempire/allblackx.png
new file mode 100644
index 00000000..4e542b42
Binary files /dev/null and b/public/img/logos/xempire/allblackx.png differ
diff --git a/seeds/01_sites.js b/seeds/01_sites.js
index 705df1a4..3274be06 100644
--- a/seeds/01_sites.js
+++ b/seeds/01_sites.js
@@ -582,7 +582,7 @@ function getSites(networksMap) {
     },
     {
       slug: 'pornstarslikeitbig',
-      name: 'Pornstars Like it Big',
+      name: 'Pornstars Like It Big',
       url: 'https://www.brazzers.com/sites/view/id/24/pornstars-like-it-big',
       description: "A real big dick, that's what everyone wants. Porn-stars are no exception, all the biggest stars agree; BIG COCK is for them. Check out how it stretches their tiny pussies and cums on their round tits. We've got the best chicks jocking the biggest dicks.",
       network_id: networksMap.brazzers,
@@ -2397,6 +2397,13 @@ function getSites(networksMap) {
       url: 'https://www.darkx.com',
       network_id: networksMap.xempire,
     },
+    {
+      slug: 'allblackx',
+      name: 'AllBlackX',
+      description: 'AllBlackX.com features the hottest ebony pornstar beauties in hardcore black on black gonzo porn. From director Mason, watch 4k ultra HD videos inside',
+      url: 'https://www.allblackx.com',
+      network_id: networksMap.xempire,
+    },
     {
       slug: 'lesbianx',
       name: 'LesbianX',
diff --git a/seeds/03_tags.js b/seeds/03_tags.js
index 67fc8fe9..ffff0f83 100644
--- a/seeds/03_tags.js
+++ b/seeds/03_tags.js
@@ -1166,6 +1166,10 @@ function getTagAliases(tagsMap) {
       name: 'dp',
       alias_for: tagsMap['double-penetration'],
     },
+    {
+      name: 'first dp',
+      alias_for: tagsMap['double-penetration'],
+    },
     {
       name: 'double penetration (dp)',
       alias_for: tagsMap['double-penetration'],
diff --git a/src/media.js b/src/media.js
index 010e1a27..f555711a 100644
--- a/src/media.js
+++ b/src/media.js
@@ -10,7 +10,6 @@ const sharp = require('sharp');
 const blake2 = require('blake2');
 
 const knex = require('./knex');
-const pluckPhotos = require('./utils/pluck-photos');
 
 function getHash(buffer) {
   const hash = blake2.createHash('blake2b', { digestLength: 24 });
@@ -20,6 +19,21 @@
   return hash.digest('hex');
 }
 
+function pluckPhotos(photos, release, specifiedLimit) {
+  const limit = specifiedLimit || config.media.limit;
+
+  if (photos.length <= limit) {
+    return photos;
+  }
+
+  const plucked = [1]
+    .concat(
+      Array.from({ length: limit }, (value, index) => Math.round((index + 1) * (photos.length / (limit)))),
+    );
+
+  return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close
+}
+
 async function getThumbnail(buffer) {
   return sharp(buffer)
     .resize({
@@ -94,7 +108,12 @@ async function filterHashDuplicates(files, domains = ['releases'], roles = ['pho
   return files.filter(file => file && !photoHashes.has(file.hash));
 }
 
-async function fetchPhoto(photoUrl, index, identifier) {
+async function fetchPhoto(photoUrl, index, identifier, attempt = 1) {
+  if (Array.isArray(photoUrl)) {
+    return fetchPhoto(photoUrl[0], index, identifier);
+    // return photoUrl.reduce(async (outcome, url) => outcome.catch(async () => fetchPhoto(url, index, identifier)), Promise.reject());
+  }
+
   try {
     const { pathname } = new URL(photoUrl);
     const mimetype = mime.getType(pathname);
@@ -116,7 +135,12 @@ async function fetchPhoto(photoUrl, index, identifier) {
 
     throw new Error(`Response ${res.statusCode} not OK`);
   } catch (error) {
-    console.warn(`Failed to store photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`);
+    console.warn(`Failed attempt ${attempt}/3 to fetch photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`);
+
+    if (attempt < 3) {
+      await Promise.delay(1000);
+      return fetchPhoto(photoUrl, index, identifier, attempt + 1);
+    }
 
     return null;
   }
diff --git a/src/networks.js b/src/networks.js
index 37ce81f2..ab1da0d5 100644
--- a/src/networks.js
+++ b/src/networks.js
@@ -39,6 +39,7 @@ async function findNetworkByUrl(url) {
 
   const network = await knex('networks')
     .where('networks.url', 'like', `%${domain}`)
+    .orWhere('networks.url', url)
     .first();
 
   if (network) {
diff --git a/src/releases.js b/src/releases.js
index 17f2a120..25d108ed 100644
--- a/src/releases.js
+++ b/src/releases.js
@@ -248,7 +248,6 @@ async function storeReleaseAssets(release, releaseId) {
 
   try {
     await Promise.all([
-      associateTags(release, releaseId),
       storePhotos(release, releaseId),
       storePoster(release, releaseId),
       storeTrailer(release, releaseId),
@@ -275,17 +274,22 @@ async function storeRelease(release) {
       })
       .returning('*');
 
-    // await storeReleaseAssets(release, existingRelease.id);
-    console.log(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);
+    if (updatedRelease) {
+      await associateTags(release, updatedRelease.id);
+      console.log(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);
+    }
 
-    return updatedRelease ? updatedRelease.id : existingRelease.id;
+    await associateTags(release, existingRelease.id);
+
+    return existingRelease.id;
   }
 
   const [releaseEntry] = await knex('releases')
     .insert(curatedRelease)
     .returning('*');
 
-  // await storeReleaseAssets(release, releaseEntry.id);
+  await associateTags(release, releaseEntry.id);
+
   console.log(`Stored release "${release.title}" (${releaseEntry.id}, ${release.site.name})`);
 
   return releaseEntry.id;
@@ -334,7 +338,9 @@ async function storeReleases(releases) {
 
   await Promise.all([
     associateActors(actors, storedReleases),
-    Promise.all(storedReleases.map(async release => storeReleaseAssets(release, release.id))),
+    Promise.map(storedReleases, async release => storeReleaseAssets(release, release.id), {
+      concurrency: 10,
+    }),
   ]);
 
   return storedReleases;
diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js
index 10147447..a7fd5a85 100644
--- a/src/scrapers/julesjordan.js
+++ b/src/scrapers/julesjordan.js
@@ -7,7 +7,6 @@ const { JSDOM } = require('jsdom');
 const moment = require('moment');
 
 const { heightToCm } = require('../utils/convert');
-const { matchTags } = require('../tags');
 
 async function fetchPhotos(url) {
   const res = await bhttp.get(url);
@@ -22,13 +21,8 @@ function scrapePhotos(html) {
     .map((photoIndex, photoElement) => {
       const src = $(photoElement).attr('src');
 
-      if (src.match(/dl\d+/)) {
-        // thumbnail URLs containing dl02/ or dl03/ don't appear to have
-        // a full photo available, fall back to thumbnail
-        return src;
-      }
-
-      return src.replace('thumbs/', 'photos/');
+      // high res often available in photos/ directory, but not always, provide original as fallback
+      return [src.replace('thumbs/', 'photos/'), src];
     })
     .toArray();
 
@@ -172,8 +166,8 @@ async function scrapeScene(html, url, site) {
 
   const photos = await getPhotos(entryId, site);
 
-  const rawTags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
-  const tags = await matchTags(rawTags);
+  const tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
+  const movie = $('.update_dvds a').href;
 
   return {
     url,
@@ -184,6 +178,7 @@ async function scrapeScene(html, url, site) {
     description,
     poster,
     photos,
+    movie,
     trailer: {
       src: trailer,
       quality: 720,
diff --git a/src/scrapers/xempire.js b/src/scrapers/xempire.js
index 61f663b6..34d46cf7 100644
--- a/src/scrapers/xempire.js
+++ b/src/scrapers/xempire.js
@@ -6,14 +6,12 @@ const cheerio = require('cheerio');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');
 
-const { fetchSites } = require('../sites');
-const { matchTags } = require('../tags');
-
 const defaultTags = {
   hardx: [],
   darkx: ['interracial'],
   eroticax: [],
   lesbianx: ['lesbian'],
+  allblackx: ['ebony', 'bbc'],
 };
 
 async function fetchPhotos(url) {
@@ -25,37 +23,56 @@ function scrapePhotos(html) {
   const $ = cheerio.load(html, { normalizeWhitespace: true });
 
-  const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
-    .map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
+  return $('.preview .imgLink').toArray().map((linkEl) => {
+    const url = $(linkEl).attr('href');
 
-  const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
-    .map((photoIndex, photoElement) => $(photoElement)
-      .attr('src'))
-    // .replace('_tb.jpg', '.jpg')) does not always work
-    .toArray();
+    if (url.match('/join')) {
+      // URL links to join page instead of full photo, extract thumbnail
+      const src = $(linkEl).find('img').attr('src');
 
-  return unlockedPhotos.concat(lockedThumbnails);
+      if (src.match('previews/')) {
+        // resource often serves full photo at a modifier URL anyway, add as primary source
+        const highRes = src
+          .replace('previews/', '')
+          .replace('_tb.jpg', '.jpg');
+
+        // keep original thumbnail as fallback in case full photo is not available
+        return [highRes, src];
+      }
+
+      return src;
+    }
+
+    // URL links to full photo
+    return url;
+  });
 }
 
 async function getPhotos(albumPath, siteDomain) {
   const albumUrl = `https://${siteDomain}${albumPath}`;
 
-  const html = await fetchPhotos(albumUrl);
-  const $ = cheerio.load(html, { normalizeWhitespace: true });
-  const photos = scrapePhotos(html);
+  try {
+    const html = await fetchPhotos(albumUrl);
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+    const photos = scrapePhotos(html);
 
-  const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
+    const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
 
-  const otherPhotos = await Promise.map(pages, async (page) => {
-    const pageUrl = `https://${siteDomain}${page}`;
-    const pageHtml = await fetchPhotos(pageUrl);
+    const otherPhotos = await Promise.map(pages, async (page) => {
+      const pageUrl = `https://${siteDomain}${page}`;
+      const pageHtml = await fetchPhotos(pageUrl);
 
-    return scrapePhotos(pageHtml);
-  }, {
-    concurrency: 2,
-  });
+      return scrapePhotos(pageHtml);
+    }, {
+      concurrency: 2,
+    });
 
-  return photos.concat(otherPhotos.flat());
+    return photos.concat(otherPhotos.flat());
+  } catch (error) {
+    console.error(`Failed to fetch XEmpire photos from ${albumPath}: ${error.message}`);
+
+    return [];
+  }
 }
 
 function scrape(html, site) {
@@ -109,32 +126,26 @@ async function scrapeScene(html, url, site) {
   const $ = cheerio.load(html, { normalizeWhitespace: true });
 
   const json = $('script[type="application/ld+json"]').html();
+  const json2 = $('script:contains("dataLayer = ")').html();
   const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
 
   const data = JSON.parse(json)[0];
+  const data2 = JSON.parse(json2.slice(json2.indexOf('[{'), -1))[0];
   const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
 
-  const entryId = new URL(url).pathname.split('/').slice(-1)[0];
+  const entryId = data2.sceneDetails.sceneId || new URL(url).pathname.split('/').slice(-1)[0];
 
-  const title = $('meta[name="twitter:title"]').attr('content');
-  const description = data.description || $('meta[name="twitter:description"]').attr('content');
+  const title = data2.sceneDetails.sceneTitle || $('meta[name="twitter:title"]').attr('content');
+  const description = data2.sceneDetails.sceneDescription || data.description || $('meta[name="twitter:description"]').attr('content');
 
   // date in data object is not the release date of the scene, but the date the entry was added
   const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate();
 
-  const actors = data.actor
-    .sort(({ gender: genderA }, { gender: genderB }) => {
-      if (genderA === 'female' && genderB === 'male') return -1;
-      if (genderA === 'male' && genderB === 'female') return 1;
-
-      return 0;
-    })
-    .map(actor => actor.name);
-
+  const actors = (data2.sceneDetails.sceneActors || data.actor).map(actor => actor.actorName || actor.name);
   const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5;
   const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
 
-  const siteDomain = $('meta[name="twitter:domain"]').attr('content');
+  const siteDomain = $('meta[name="twitter:domain"]').attr('content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
   const siteSlug = siteDomain && siteDomain.split('.')[0].toLowerCase();
   const siteUrl = siteDomain && `https://www.${siteDomain}`;
 
@@ -144,19 +155,10 @@ async function scrapeScene(html, url, site) {
   const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain, site);
 
   const rawTags = data.keywords.split(', ');
-
-  const [[channelSite], tags] = await Promise.all([
-    site.isFallback
-      ? fetchSites({
-        url: siteUrl,
-        slug: siteSlug,
-      })
-      : [site],
-    matchTags([...defaultTags[siteSlug], ...rawTags]),
-  ]);
+  const tags = [...defaultTags[siteSlug], ...rawTags];
 
   return {
-    url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url,
+    url: `${siteUrl}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`,
     entryId,
     title,
     date,
@@ -174,7 +176,8 @@ async function scrapeScene(html, url, site) {
     rating: {
       stars,
     },
-    site: channelSite || site,
+    site,
+    channel: siteSlug,
   };
 }
diff --git a/src/sites.js b/src/sites.js
index 5894bf33..d64be7d1 100644
--- a/src/sites.js
+++ b/src/sites.js
@@ -62,11 +62,14 @@ async function findSiteByUrl(url) {
       'sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url',
       'networks.description as network_description', 'networks.parameters as network_parameters',
     )
-    .where('sites.url', 'like', `%${domain}%`)
+    .where('sites.url', 'like', `%${domain}`)
+    .orWhere('sites.url', url)
     .first();
 
   if (site) {
-    return curateSite(site, true);
+    const curatedSite = curateSite(site, true);
+
+    return curatedSite;
   }
 
   return null;