traxxx/src/scrapers/xempire.js

'use strict';

const Promise = require('bluebird');
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');

const knex = require('../knex');
const { matchTags } = require('../tags');
const pluckPhotos = require('../utils/pluck-photos');
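
// tags implied by each channel, merged with the scraped keywords in scrapeScene()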
const defaultTags = {
  hardx: [],
  darkx: ['interracial'],
  eroticax: [],
  lesbianx: ['lesbian'],
};
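
// fetch the raw HTML of a photo album (or album page)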
async function fetchPhotos(url) {
  const res = await bhttp.get(url);
  return res.body.toString();
}
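
// collect full-size photos when unlocked, and fall back to thumbnails for locked ones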
function scrapePhotos(html) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
    .map((photoIndex, photoElement) => $(photoElement).attr('href'))
    .toArray();

  const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
    .map((photoIndex, photoElement) => $(photoElement).attr('src'))
    // .replace('_tb.jpg', '.jpg') does not always work
    .toArray();

  return unlockedPhotos.concat(lockedThumbnails);
}
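
// scrape the album and all of its paginated pages, then pluck a limited selection of photos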
async function getPhotos(albumPath, siteDomain, site) {
  const albumUrl = `https://${siteDomain}${albumPath}`;

  const html = await fetchPhotos(albumUrl);
  const $ = cheerio.load(html, { normalizeWhitespace: true });
  const photos = scrapePhotos(html);

  // follow the album's pagination and scrape the remaining pages, two at a time
  const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
  const otherPhotos = await Promise.map(pages, async (page) => {
    const pageUrl = `https://${siteDomain}${page}`;
    const pageHtml = await fetchPhotos(pageUrl);

    return scrapePhotos(pageHtml);
  }, {
    concurrency: 2,
  });

  const allPhotos = photos.concat(otherPhotos.flat());

  // limit the number of photos per scene; the network parameters may override the default of 25
  const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
  const photoIndexes = pluckPhotos(allPhotos.length - 1, photoLimit);
  const pluckedPhotos = photoIndexes.map(photoIndex => allPhotos[photoIndex]);

  return pluckedPhotos;
}
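
// scrape the scene overview page used by fetchLatest() and fetchUpcoming()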
function scrape(html, site) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });
  const scenesElements = $('li[data-itemtype=scene]').toArray();

  return scenesElements.map((element) => {
    const sceneLinkElement = $(element).find('.sceneTitle a');
    const url = `${site.url}${sceneLinkElement.attr('href')}`;
    const title = sceneLinkElement.attr('title');
    const entryId = $(element).attr('data-itemid');

    const date = moment
      .utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY')
      .toDate();

    const actors = $(element).find('.sceneActors a')
      .map((actorIndex, actorElement) => $(actorElement).attr('title'))
      .toArray();

    const [likes, dislikes] = $(element).find('.value')
      .toArray()
      .map(value => Number($(value).text()));

    const poster = $(element).find('.imgLink img').attr('data-original');
    const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;

    return {
      url,
      entryId,
      title,
      actors,
      director: 'Mason',
      date,
      poster,
      trailer: {
        src: trailer,
        quality: 224,
      },
      rating: {
        likes,
        dislikes,
      },
      site,
    };
  });
}
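
// scrape an individual scene page, including photos, tags and the channel site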
async function scrapeScene(html, url, site) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  // structured scene data is embedded as JSON-LD; the player options live in an inline script
  const json = $('script[type="application/ld+json"]').html();
  const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
  const data = JSON.parse(json)[0];
  const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));

  const entryId = new URL(url).pathname.split('/').slice(-1)[0];
  const title = $('meta[name="twitter:title"]').attr('content');
  const description = data.description || $('meta[name="twitter:description"]').attr('content');

  // the date in the data object is not the scene's release date, but the date the entry was added
  const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate();

  // list female actors before male actors
  const actors = data.actor
    .sort(({ gender: genderA }, { gender: genderB }) => {
      if (genderA === 'female' && genderB === 'male') return -1;
      if (genderA === 'male' && genderB === 'female') return 1;
      return 0;
    })
    .map(actor => actor.name);

  const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5;
  const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();

  const siteDomain = $('meta[name="twitter:domain"]').attr('content');
  const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
  const siteUrl = siteDomain && `https://www.${siteDomain}`;

  const poster = videoData.picPreview;
  const trailer = `${videoData.playerOptions.host}${videoData.url}`;
  const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain, site);

  const rawTags = data.keywords.split(', ');

  // when scraping from the fallback (network) site, resolve the scene to its channel site
  const [channelSite, tags] = await Promise.all([
    site.isFallback
      ? knex('sites')
        .where({ url: siteUrl })
        .orWhere({ slug: siteId })
        .first()
      : site,
    matchTags([...defaultTags[siteId], ...rawTags]),
  ]);

  return {
    url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url,
    entryId,
    title,
    date,
    actors,
    director: 'Mason',
    description,
    duration,
    poster,
    photos,
    trailer: {
      src: trailer,
      quality: parseInt(videoData.sizeOnLoad, 10),
    },
    tags,
    rating: {
      stars,
    },
    site: channelSite || site,
  };
}
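
// exported entry points: latest scenes, upcoming scenes and individual scene pages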
async function fetchLatest(site, page = 1) {
  const res = await bhttp.get(`${site.url}/en/videos/AllCategories/0/${page}`);
  return scrape(res.body.toString(), site);
}

async function fetchUpcoming(site) {
  const res = await bhttp.get(`${site.url}/en/videos/AllCategories/0/1/upcoming`);
  return scrape(res.body.toString(), site);
}

async function fetchScene(url, site) {
  const res = await bhttp.get(url);
  return scrapeScene(res.body.toString(), url, site);
}

module.exports = {
  fetchLatest,
  fetchUpcoming,
  fetchScene,
};
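
// A minimal usage sketch. The `site` object shape shown here (url, isFallback,
// network.parameters.photoLimit) is an assumption based on how the functions
// above read it; the real object is constructed elsewhere in traxxx.
//
// const { fetchLatest, fetchScene } = require('./xempire');
//
// (async () => {
//   const site = {
//     url: 'https://www.hardx.com', // hypothetical channel URL
//     isFallback: false,
//     network: { parameters: { photoLimit: 25 } },
//   };
//
//   const latestScenes = await fetchLatest(site, 1);
//   const scene = await fetchScene(latestScenes[0].url, site);
// })();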