Refactored 21sextury scraper.

2019-12-09 05:00:49 +01:00
parent d874c508de
commit 04a89efa58
52 changed files with 2621 additions and 2068 deletions

View File

@@ -1,13 +1,15 @@
{
"extends": "airbnb-base",
"parserOptions": {
"parser": "babel-eslint",
"sourceType": "script"
},
"rules": {
"strict": 0,
"no-unused-vars": ["error", {"argsIgnorePattern": "^_"}],
"no-console": 0,
"indent": ["error", 4],
"max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}]
"indent": "off",
"max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}],
"template-curly-spacing": "off"
}
}

View File

@@ -271,6 +271,8 @@ async function updateActor(actor, scraped = false, scrapeSuccess = false) {
}
async function mergeProfiles(profiles, actor) {
console.log(profiles);
const mergedProfile = profiles.reduce((prevProfile, profile) => {
if (profile === null) {
return prevProfile;
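Only the top of the reducer is visible in this hunk; a minimal sketch of the merge pattern it starts, with the actor argument omitted and the precedence rule assumed rather than taken from this commit:

```js
// sketch: fold the per-network profiles into one, skipping failed (null)
// scrapes; letting earlier profiles win is an assumption
function mergeProfiles(profiles) {
    return profiles.reduce((prevProfile, profile) => {
        if (profile === null) return prevProfile;

        return {
            ...profile,
            ...prevProfile, // values gathered earlier take precedence
        };
    }, {});
}
```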

View File

@@ -46,4 +46,4 @@ async function init() {
await initServer();
}
init();
module.exports = init;

View File

@@ -63,6 +63,7 @@ const { argv } = yargs
.option('debug', {
describe: 'Show error stack traces',
type: 'boolean',
default: process.env.NODE_ENV === 'development',
});
module.exports = argv;

src/init.js Normal file (4 additions)
View File

@@ -0,0 +1,4 @@
require('babel-polyfill');
const init = require('./app');
init();
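With src/app.js now only exporting init and the new src/init.js loading babel-polyfill before booting, other code can require the app without side effects. A sketch of the pattern this enables (the require.main guard and consumer are illustrative, not part of the commit):

```js
// sketch (hypothetical consumer): because src/app.js no longer self-invokes,
// tooling and tests can require it without starting the server; only a direct
// invocation boots it
require('babel-polyfill');

const init = require('./app');

if (require.main === module) {
    init();
}
```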

View File

@@ -10,6 +10,7 @@ const sharp = require('sharp');
const blake2 = require('blake2');
const knex = require('./knex');
const pluckPhotos = require('./utils/pluck-photos');
function getHash(buffer) {
const hash = blake2.createHash('blake2b', { digestLength: 24 });
@@ -94,10 +95,10 @@ async function filterHashDuplicates(files, domains = ['releases'], roles = ['pho
}
async function fetchPhoto(photoUrl, index, identifier) {
const { pathname } = new URL(photoUrl);
const mimetype = mime.getType(pathname);
try {
const { pathname } = new URL(photoUrl);
const mimetype = mime.getType(pathname);
const res = await bhttp.get(photoUrl);
if (res.statusCode === 200) {
@@ -176,7 +177,11 @@ async function storePhotos(release, releaseId) {
return;
}
const newPhotos = await filterSourceDuplicates(release.photos, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`);
const pluckedPhotos = pluckPhotos(release.photos, release);
console.log(release.photos, pluckedPhotos);
const newPhotos = await filterSourceDuplicates(pluckedPhotos, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`);
if (newPhotos.length === 0) return;
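A few lines up, fetchPhoto now parses the photo URL and resolves its mimetype inside the try block. Since new URL() throws synchronously on malformed input, that keeps a bad photo URL from escaping the existing error handling; a minimal sketch of the idea (getPhotoMimetype is a hypothetical helper, not part of this commit):

```js
const mime = require('mime');

// sketch: new URL() throws synchronously on malformed input, so parsing inside
// the try block lets a bad photo URL be handled like any failed download
function getPhotoMimetype(photoUrl) {
    try {
        const { pathname } = new URL(photoUrl);

        return mime.getType(pathname);
    } catch (error) {
        return null; // the real code logs and skips the photo instead
    }
}
```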

View File

@@ -125,7 +125,7 @@ async function curateScrapedRelease(release) {
likes: release.rating && release.rating.likes,
dislikes: release.rating && release.rating.dislikes,
rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
deep: Boolean(argv.deep && release.url && !release.upcoming),
deep: typeof release.deep === 'boolean' ? release.deep : false,
};
if (release.site.isFallback && release.channel) {
@@ -275,6 +275,12 @@ async function storeRelease(release) {
async function storeReleases(releases) {
const storedReleases = await Promise.map(releases, async (release) => {
if (release.site.isFallback && !release.channel) {
console.error(`Unable to derive channel site from generic URL: ${release.url}.`);
return null;
}
try {
const releaseId = await storeRelease(release);
@@ -289,7 +295,7 @@ async function storeReleases(releases) {
}
}, {
concurrency: 10,
});
}).filter(release => release);
const actors = storedReleases.reduce((acc, release) => {
if (!release.actors) return acc;
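storeReleases now skips fallback-site releases without a derived channel and drops the resulting nulls by chaining .filter() straight onto Promise.map(), which works because Bluebird promises carry array helpers. A reduced sketch of the pattern (storeAll and its arguments are stand-ins):

```js
const Promise = require('bluebird');

// sketch: Bluebird's promise-level .filter() drops the null results produced
// for releases that could not be stored, without an intermediate await
async function storeAll(releases, storeRelease) {
    return Promise.map(releases, async (release) => {
        try {
            return await storeRelease(release);
        } catch (error) {
            return null;
        }
    }, { concurrency: 10 })
        .filter(release => release);
}
```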

View File

@@ -51,7 +51,9 @@ async function scrapeRelease(url, release, deep = false) {
// don't store release when called by site scraper
const [storedRelease] = await storeReleases([scene]);
console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`);
if (storedRelease) {
console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`);
}
}
return scene;

View File

@@ -69,12 +69,20 @@ async function scrapeUpcomingReleases(scraper, site) {
async function deepFetchReleases(baseReleases) {
return Promise.map(baseReleases, async (release) => {
if (release.url) {
const fullRelease = await scrapeRelease(release.url, release, true);
try {
const fullRelease = await scrapeRelease(release.url, release, true);
return {
...release,
...fullRelease,
};
return {
...release,
...fullRelease,
deep: true,
};
} catch (error) {
return {
...release,
deep: false,
};
}
}
return release;
@@ -116,7 +124,7 @@ async function scrapeReleases() {
return await scrapeSiteReleases(scraper, site);
} catch (error) {
if (argv.debug) {
console.error(`${site.id}: Failed to scrape releases`, error);
console.error(`${site.name}: Failed to scrape releases`, error);
}
console.warn(`${site.id}: Failed to scrape releases`);
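deepFetchReleases now wraps each deep scrape in its own try/catch, so one failing URL no longer rejects the whole Promise.map batch; the release keeps its shallow data and records whether the deep pass succeeded via the deep flag that curateScrapedRelease reads above. A condensed sketch of that shape (deepFetch is a stand-in name):

```js
// sketch: per-release error handling, so a failed deep scrape degrades to the
// shallow release instead of aborting the batch
async function deepFetch(release, scrapeRelease) {
    try {
        const fullRelease = await scrapeRelease(release.url, release, true);

        return { ...release, ...fullRelease, deep: true };
    } catch (error) {
        return { ...release, deep: false };
    }
}
```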

View File

@@ -1,11 +1,51 @@
'use strict';
const Promise = require('bluebird');
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
async function fetchPhotos(photoPath) {
const res = await bhttp.get(`https://21sextury.com${photoPath}`);
return res.body.toString();
}
function scrapePhotos(html) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
.map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
.map((photoIndex, photoElement) => $(photoElement)
.attr('src'))
// .replace('_tb.jpg', '.jpg')) does not always work
.toArray();
return unlockedPhotos.concat(lockedThumbnails);
}
async function getPhotos(photoPath) {
if (!photoPath || photoPath.match('join')) {
return [];
}
const html = await fetchPhotos(photoPath);
const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = scrapePhotos(html);
const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
const otherPhotos = await Promise.map(pages, async (pagePath) => {
const pageHtml = await fetchPhotos(pagePath);
return scrapePhotos(pageHtml);
}, {
concurrency: 2,
});
return photos.concat(otherPhotos.flat());
}
function scrape(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
@@ -14,12 +54,13 @@ function scrape(html, site) {
return scenesElements.reduce((accReleases, element) => {
const siteName = $(element).find('.studioName a').attr('title');
if (site.parameters && site.parameters.filter && siteName.toLowerCase() !== site.name.toLowerCase()) {
if (!site.url && siteName.toLowerCase() !== site.name.toLowerCase()) {
// using generic overview as fallback, scene from different site
return accReleases;
}
const sceneLinkElement = $(element).find('.sceneTitle a');
const url = `${site.url}${sceneLinkElement.attr('href')}`;
const url = `${site.url || 'https://www.21sextury.com'}${sceneLinkElement.attr('href')}`;
const title = sceneLinkElement.attr('title').trim();
const entryId = $(element).attr('data-itemid');
@@ -32,6 +73,9 @@ function scrape(html, site) {
.map((actorIndex, actorElement) => $(actorElement).attr('title'))
.toArray();
const poster = $(element).find('.imgLink img').attr('data-original');
const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;
const [likes, dislikes] = $(element).find('.value')
.toArray()
.map(value => Number($(value).text()));
@@ -44,6 +88,10 @@ function scrape(html, site) {
title,
actors,
date,
poster,
trailer: {
src: trailer,
},
rating: {
likes,
dislikes,
@@ -58,25 +106,21 @@ async function scrapeScene(html, url, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const sceneElement = $('#videoWrapper');
const json = $('script[type="application/ld+json"]').html();
const videoJson = $('script:contains("ScenePlayerOptions")').html();
const videoDataString = videoJson.slice(videoJson.indexOf('= {') + 2, videoJson.indexOf('};') + 1);
const data = JSON.parse(json)[0];
const videoData = JSON.parse(videoDataString);
const entryId = new URL(url).pathname.split('/').slice(-1)[0];
const title = data.isPartOf ? data.isPartOf.name : data.name;
const dataDate = moment.utc(data.dateCreated, 'YYYY-MM-DD');
const title = videoData?.playerOptions?.sceneInfos?.sceneTitle || (data.isPartOf && data.isPartOf !== 'TBD' ? data.isPartOf.name : data.name);
const dataDate = moment.utc(videoData?.playerOptions?.sceneInfos?.sceneReleaseDate, 'YYYY-MM-DD');
const date = dataDate.isValid()
? dataDate.toDate()
: moment.utc(sceneElement.find('.updatedDate').text().trim(), 'MM-DD-YYYY').toDate();
const actors = data.actor
.sort(({ gender: genderA }, { gender: genderB }) => {
if (genderA === 'female' && genderB === 'male') return -1;
if (genderA === 'male' && genderB === 'female') return 1;
return 0;
})
.map(actor => actor.name);
const actors = data.actor.map(actor => actor.name);
const description = data.description || null; // prevent empty string
const likes = Number(sceneElement.find('.rating .state_1 .value').text());
@@ -84,27 +128,18 @@ async function scrapeScene(html, url, site) {
const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
const rawTags = data.keywords.split(', ');
const poster = videoData.picPreview;
const trailer = `${videoData.playerOptions.host}${videoData.url}`;
const photoPath = $('.picturesItem a').attr('href');
const photos = await getPhotos(photoPath, site);
const tags = data.keywords.split(', ');
const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title');
const siteId = siteName && siteName.replace(/\s+/g, '').toLowerCase();
const [channelSite, tags] = await Promise.all([
site.isFallback
? knex('sites')
.where({ slug: siteId })
.orWhereRaw('name = ? collate NOCASE', [siteName])
.first()
: site,
matchTags(rawTags),
]);
// only replace generic URL with site URL if site is not marked to fetch scenes from generic site
const originalUrl = channelSite && !(channelSite.parameters && JSON.parse(channelSite.parameters).filter)
? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`
: url;
const channel = siteName && siteName.replace(/\s+/g, '').toLowerCase();
return {
url: originalUrl,
url,
entryId,
title,
date,
@@ -112,22 +147,30 @@ async function scrapeScene(html, url, site) {
description,
duration,
tags,
poster,
photos,
trailer: {
src: trailer,
},
rating: {
likes,
dislikes,
},
site: channelSite || site,
site,
channel,
};
}
async function fetchLatest(site, page = 1) {
const res = await bhttp.get(`${site.parameters && site.parameters.filter ? 'https://21sextury.com' : site.url}/en/videos/All-Categories/0/All-Pornstars/0/latest/${page}`);
const url = `${site.url || 'https://21sextury.com'}/en/videos/All-Categories/0/All-Pornstars/0/latest/${page}`;
const res = await bhttp.get(url);
return scrape(res.body.toString(), site);
}
async function fetchUpcoming(site) {
const res = await bhttp.get(`${site.parameters && site.parameters.filter ? 'https://21sextury.com' : site.url}/en/videos/All-Categories/0/All-Pornstars/0/upcoming`);
const url = `${site.url || 'https://21sextury.com'}/en/videos/All-Categories/0/All-Pornstars/0/upcoming`;
const res = await bhttp.get(url);
return scrape(res.body.toString(), site);
}
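The hunks stop before the bottom of the 21sextury scraper, so the scene fetcher and exports are not shown. A sketch of the assumed wiring, in line with how fetchLatest and fetchUpcoming above are built (fetchScene and the export list are assumptions, not taken from this diff):

```js
// sketch (assumed, not shown in the hunks): the scraper exposes its entry
// points so the site scraper can dispatch to them
async function fetchScene(url, site) {
    const res = await bhttp.get(url);

    return scrapeScene(res.body.toString(), url, site);
}

module.exports = {
    fetchLatest,
    fetchUpcoming,
    fetchScene,
};
```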

View File

@@ -6,7 +6,6 @@ const { JSDOM } = require('jsdom');
const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
/* eslint-disable newline-per-chained-call */
function scrapeLatest(html, site) {
@@ -49,13 +48,16 @@ async function scrapeScene(html, url, site) {
const title = $('meta[itemprop="name"]').attr('content');
const description = $('.descr-box p').text(); // meta tags don't contain full description
const date = moment.utc($('meta[itemprop="uploadDate"]').attr('content'), 'YYYY-MM-DD').toDate();
const dateProp = $('meta[itemprop="uploadDate"]').attr('content');
const date = dateProp
? moment.utc($('meta[itemprop="uploadDate"]').attr('content'), 'YYYY-MM-DD').toDate()
: moment.utc($('.title-border:nth-child(2) p').text(), 'MM.DD.YYYY').toDate();
const actors = $('.pornstar-card > a').map((actorIndex, actorElement) => $(actorElement).attr('title')).toArray();
const likes = Number($('.info-panel.likes .likes').text());
const duration = Number($('.info-panel.duration .duration').text().slice(0, -4)) * 60;
const rawTags = $('.tags-tab .tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const tags = $('.tags-tab .tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const poster = $('#video').attr('poster');
const photos = $('.photo-slider-guest .card a').map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
@@ -63,21 +65,7 @@ async function scrapeScene(html, url, site) {
const trailer540 = $('source[res="540"]').attr('src');
const trailer720 = $('source[res="720"]').attr('src');
/*
* broken as of nov 2019
const { origin } = new URL($('.pornstar-card meta[itemprop="url"]').first().attr('content'));
const [channelSite, tags] = await Promise.all([
// don't find site if original is already specific
site.isFallback ? knex('sites').where({ url: origin }).first() : site,
matchTags(rawTags),
]);
*/
const tags = await matchTags(rawTags);
return {
// url: channelSite ? `${channelSite.url}${new URL(url).pathname}` : url,
url,
entryId,
title,
@@ -88,20 +76,19 @@ async function scrapeScene(html, url, site) {
tags,
poster,
photos,
trailer: trailer540
? {
src: trailer540,
quality: 540,
}
: {
// backup
trailer: [
{
src: trailer720,
quality: 720,
},
{
src: trailer540,
quality: 540,
},
],
rating: {
likes,
},
// site: channelSite || site,
site,
};
}
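The 21naturals scraper now returns trailer as an array of { src, quality } candidates rather than a single object, so whatever stores trailers has to accept both shapes. A hedged sketch of such a normalizer (pickTrailer is hypothetical and not part of this commit):

```js
// sketch: normalize a scraped trailer that may be a single { src, quality }
// object or an array of candidates, preferring the highest listed quality
function pickTrailer(trailer) {
    if (!trailer) return null;

    if (Array.isArray(trailer)) {
        return trailer
            .filter(candidate => candidate && candidate.src)
            .sort((a, b) => (b.quality || 0) - (a.quality || 0))[0] || null;
    }

    return trailer.src ? trailer : null;
}
```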

View File

@@ -8,7 +8,6 @@ const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
const pluckPhotos = require('../utils/pluck-photos');
async function getPhoto(url) {
const res = await bhttp.get(url);
@@ -20,7 +19,7 @@ async function getPhoto(url) {
return photoUrl;
}
async function getPhotos(albumUrl, site, siteUrl) {
async function getPhotos(albumUrl) {
const res = await bhttp.get(albumUrl);
const html = res.body.toString();
const { document } = new JSDOM(html).window;
@@ -28,15 +27,7 @@ async function getPhotos(albumUrl, site, siteUrl) {
const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10);
// dogfart has massive albums, pick 25 or specified number of photos: first, last and evenly in between
const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
const photoIndexes = pluckPhotos(lastPhotoIndex, photoLimit);
if (photoLimit > 25) {
console.log(`${site.name}: Scraping ${photoLimit} album photos from ${siteUrl}, this may take some time...`);
}
const photoUrls = await Promise.map(photoIndexes, async (index) => {
const photoUrls = await Promise.map(Array.from({ length: lastPhotoIndex }, (value, index) => index + 1), async (index) => {
const pageUrl = `https://blacksonblondes.com${lastPhotoPage.replace(/\d+.jpg/, `${index.toString().padStart(3, '0')}.jpg`)}`;
return getPhoto(pageUrl);

View File

@@ -9,8 +9,6 @@ const moment = require('moment');
const { heightToCm } = require('../utils/convert');
const { matchTags } = require('../tags');
const pluckPhotos = require('../utils/pluck-photos');
async function fetchPhotos(url) {
const res = await bhttp.get(url);
@@ -58,14 +56,7 @@ async function getPhotos(entryId, site, page = 1) {
})
: [];
const allPhotos = photos.concat(otherPhotos.flat());
const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
const photoIndexes = pluckPhotos(allPhotos.length - 1, photoLimit);
const pluckedPhotos = photoIndexes.map(photoIndex => allPhotos[photoIndex]);
return pluckedPhotos;
return photos.concat(otherPhotos.flat());
}
function scrapeLatest(html, site) {

View File

@@ -8,7 +8,6 @@ const moment = require('moment');
const { fetchSites } = require('../sites');
const { matchTags } = require('../tags');
const pluckPhotos = require('../utils/pluck-photos');
const defaultTags = {
hardx: [],
@@ -38,7 +37,7 @@ function scrapePhotos(html) {
return unlockedPhotos.concat(lockedThumbnails);
}
async function getPhotos(albumPath, siteDomain, site) {
async function getPhotos(albumPath, siteDomain) {
const albumUrl = `https://${siteDomain}${albumPath}`;
const html = await fetchPhotos(albumUrl);
@@ -56,14 +55,7 @@ async function getPhotos(albumPath, siteDomain, site) {
concurrency: 2,
});
const allPhotos = photos.concat(otherPhotos.flat());
const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
const photoIndexes = pluckPhotos(allPhotos.length - 1, photoLimit);
const pluckedPhotos = photoIndexes.map(photoIndex => allPhotos[photoIndex]);
return pluckedPhotos;
return photos.concat(otherPhotos.flat());
}
function scrape(html, site) {

View File

@@ -1,13 +1,18 @@
'use strict';
const config = require('config');
// pick a limited number of evenly distributed photos from the full set
function pluckPhotos(photoTotal, photoLimit) {
function pluckPhotos(photos, release, specifiedLimit) {
const limit = specifiedLimit || config.media.limit;
console.log(limit);
const plucked = [1]
.concat(
Array.from({ length: photoLimit - 1 }, (value, index) => Math.round((index + 1) * (photoTotal / (photoLimit - 1)))),
Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (photos.length / (limit)))),
);
return Array.from(new Set(plucked)); // remove duplicates, may happen when photo total and photo limit are close
return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex]); // remove duplicates, may happen when photo total and photo limit are close
}
module.exports = pluckPhotos;
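pluckPhotos now takes the scraped photos directly and reads its default limit from config.media.limit rather than each scraper passing a per-site photoLimit; the config entry itself is not part of this diff. A sketch of what it might look like (the key name comes from the code above, the value of 25 mirrors the old default and is an assumption):

```js
// config/default.js (sketch): the limit consumed by pluckPhotos via
// config.media.limit
module.exports = {
    media: {
        limit: 25,
    },
};
```

Scrapers can then return every album photo and let storePhotos trim the set once, instead of each scraper trimming its own.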