traxxx/src/scrapers/legalporno.js

'use strict';

const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');

const { matchTags } = require('../tags');

/**
 * Separate a trailing shoot ID from a scene title.
 *
 * Only the last space-separated word is inspected. When it contains one of
 * the known studio prefixes followed by digits, the matched text becomes the
 * shoot ID and the entire last word is removed from the title; otherwise the
 * title is returned untouched with a `null` shoot ID.
 *
 * @param {string} originalTitle - Scene title as found on the page.
 * @returns {{ shootId: (string|null), title: string }} Shoot ID and cleaned title.
 */
function extractTitle(originalTitle) {
    const words = originalTitle.split(' ');
    const lastWord = words[words.length - 1];

    // detect studio prefixes
    const idMatch = /(AB|AF|GP|SZ|IV|GIO|RS|TW|MA|FM|SAL|NR|AA|GL|BZ|FS)\d+/.exec(lastWord);

    if (!idMatch) {
        return { shootId: null, title: originalTitle };
    }

    return {
        shootId: idMatch[0],
        title: words.slice(0, -1).join(' '),
    };
}

/**
 * Build release summaries from the HTML of a "new videos" listing page.
 *
 * Every `.thumbnails > div` element yields one release. The entry ID is the
 * third `/`-separated segment of the scene URL's pathname, and the date is
 * read from the element's `release` attribute, parsed as UTC `YYYY/MM/DD`.
 *
 * @param {string} html - Markup of the listing page.
 * @param {object} site - Site record, passed through unchanged on each release.
 * @returns {object[]} Releases with url, shootId, entryId, title, date and site.
 */
function scrapeLatest(html, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });

    return $('.thumbnails > div').toArray().map((thumbnail) => {
        const $thumbnail = $(thumbnail);
        const $link = $thumbnail.find('.thumbnail-title a');
        const url = $link.attr('href');

        // use the link text: the title attribute breaks when they use \\ escaping
        const { shootId, title } = extractTitle($link.text().trim());

        const pathSegments = new URL(url).pathname.split('/');
        const entryId = pathSegments[2];

        const releaseDate = moment.utc($thumbnail.attr('release'), 'YYYY/MM/DD');

        return {
            url,
            shootId,
            entryId,
            title,
            date: releaseDate.toDate(),
            site,
        };
    });
}

/**
 * Build a full release record from the HTML of a single scene page.
 *
 * The first `.scene-description__row` element is searched for actor links
 * and the second for tag links; raw tag texts are resolved via `matchTags`.
 *
 * @param {string} html - Markup of the scene page.
 * @param {string} url - Absolute URL of the scene; the entry ID is taken from its path.
 * @param {object} site - Site record, passed through unchanged on the release.
 * @returns {Promise<object>} Release with url, shootId, entryId, title, date,
 *     actors, duration (seconds), tags and site.
 */
async function scrapeScene(html, url, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const linkTexts = selection => selection.toArray().map(link => $(link).text());

    const { shootId, title } = extractTitle($('h1.watchpage-title').text().trim());

    const pathSegments = new URL(url).pathname.split('/');
    const entryId = pathSegments[2];

    const releaseDate = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD');

    const descriptionRows = $('.scene-description__row').toArray();
    const actorsRow = descriptionRows[0];
    const tagsRow = descriptionRows[1];

    const actors = linkTexts($(actorsRow).find('a[href*="com/model"]'));

    // NOTE(review): the runtime text is handed to moment.duration as-is; confirm the
    // site's format is interpreted with the intended units.
    const runtimeText = $('span[title="Runtime"]').text().trim();
    const duration = moment.duration(runtimeText).asSeconds();

    const tags = await matchTags(linkTexts($(tagsRow).find('a')));

    return {
        url,
        shootId,
        entryId,
        title,
        date: releaseDate.toDate(),
        actors,
        duration,
        tags,
        site,
    };
}

/**
 * Download one page of the site's "new videos" listing and scrape it.
 *
 * @param {object} site - Site record; `site.url` is used as the base URL.
 * @param {number} [page=1] - Listing page number appended to the URL.
 * @returns {Promise<object[]>} Releases produced by `scrapeLatest`.
 */
async function fetchLatest(site, page = 1) {
    const listingUrl = `${site.url}/new-videos/${page}`;
    const response = await bhttp.get(listingUrl);
    const html = response.body.toString();

    return scrapeLatest(html, site);
}

/**
 * Download a single scene page and scrape it.
 *
 * @param {string} url - Absolute URL of the scene page.
 * @param {object} site - Site record, forwarded to `scrapeScene`.
 * @returns {Promise<object>} Release produced by `scrapeScene`.
 */
async function fetchScene(url, site) {
    const response = await bhttp.get(url);
    const html = response.body.toString();

    return scrapeScene(html, url, site);
}

// Only the network-facing fetchers are exported; the HTML scraping helpers stay module-private.
module.exports = {
    fetchLatest,
    fetchScene,
};
Improved module structure. Added individual scene scrapers for Jules Jordan and XEmpire. 2019-03-23 21:48:39 +00:00			`'use strict';`
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00
			`const bhttp = require('bhttp');`
			`const cheerio = require('cheerio');`
			`const moment = require('moment');`

Replaced network and tag files with SQLite database. 2019-03-25 02:57:33 +00:00			`const { matchTags } = require('../tags');`

Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00			`function extractTitle(originalTitle) {`
			`const titleComponents = originalTitle.split(' ');`
Legalporno title scene ID fix Old regex would fail on some videos e.g. Little Sofi Smile plays with two big cocks AF001 Whitney Wright Gets Punished by Two Big Black Cocks AB015 2019-09-23 16:32:47 +00:00			`const sceneIdMatch = titleComponents.slice(-1)[0].match(/(AB\|AF\|GP\|SZ\|IV\|GIO\|RS\|TW\|MA\|FM\|SAL\|NR\|AA\|GL\|BZ\|FS)\d+/); // detect studio prefixes`
Added parameters column to sites database, fixes Perv City scraper. Getting shoot ID from all existing scrapers. 2019-03-26 00:26:47 +00:00			`const shootId = sceneIdMatch ? sceneIdMatch[0] : null;`
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00			`const title = sceneIdMatch ? titleComponents.slice(0, -1).join(' ') : originalTitle;`

Added parameters column to sites database, fixes Perv City scraper. Getting shoot ID from all existing scrapers. 2019-03-26 00:26:47 +00:00			`return { shootId, title };`
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00			`}`

			`function scrapeLatest(html, site) {`
			`const $ = cheerio.load(html, { normalizeWhitespace: true });`
			`const scenesElements = $('.thumbnails > div').toArray();`

			`return scenesElements.map((element) => {`
			`const sceneLinkElement = $(element).find('.thumbnail-title a');`
			`const url = sceneLinkElement.attr('href');`

Scrapers can now iterate through pages. Filtering unique releases before saving to database. Improved scrapers and rendering. 2019-04-05 01:45:40 +00:00			`const originalTitle = sceneLinkElement.text().trim(); // title attribute breaks when they use \\ escaping`
Added parameters column to sites database, fixes Perv City scraper. Getting shoot ID from all existing scrapers. 2019-03-26 00:26:47 +00:00			`const { shootId, title } = extractTitle(originalTitle);`
Added Blowpass scraper. Split shootId and pageId. 2019-04-06 21:24:26 +00:00			`const entryId = new URL(url).pathname.split('/')[2];`
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00
			`const date = moment.utc($(element).attr('release'), 'YYYY/MM/DD').toDate();`

			`return {`
			`url,`
Added Blowpass scraper. Split shootId and pageId. 2019-04-06 21:24:26 +00:00			`shootId,`
			`entryId,`
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00			`title,`
			`date,`
			`site,`
			`};`
			`});`
			`}`

Replaced network and tag files with SQLite database. 2019-03-25 02:57:33 +00:00			`async function scrapeScene(html, url, site) {`
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00			`const $ = cheerio.load(html, { normalizeWhitespace: true });`

			`const originalTitle = $('h1.watchpage-title').text().trim();`
Added parameters column to sites database, fixes Perv City scraper. Getting shoot ID from all existing scrapers. 2019-03-26 00:26:47 +00:00			`const { shootId, title } = extractTitle(originalTitle);`
Added Blowpass scraper. Split shootId and pageId. 2019-04-06 21:24:26 +00:00			`const entryId = new URL(url).pathname.split('/')[2];`
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00
			`const date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();`

			`const [actorsElement, tagsElement] = $('.scene-description__row').toArray();`
			`const actors = $(actorsElement)`
			`.find('a[href*="com/model"]')`
			`.map((actorIndex, actorElement) => $(actorElement).text()).toArray();`

Added Kink scraper with elaborate site specification. 2019-03-24 04:28:18 +00:00			`const duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds();`
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00
			`const rawTags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();`
Replaced network and tag files with SQLite database. 2019-03-25 02:57:33 +00:00			`const tags = await matchTags(rawTags);`
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00
			`return {`
			`url,`
Added parameters column to sites database, fixes Perv City scraper. Getting shoot ID from all existing scrapers. 2019-03-26 00:26:47 +00:00			`shootId,`
Added Blowpass scraper. Split shootId and pageId. 2019-04-06 21:24:26 +00:00			`entryId,`
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00			`title,`
			`date,`
			`actors,`
			`duration,`
			`tags,`
			`site,`
			`};`
			`}`

Scrapers can now iterate through pages. Filtering unique releases before saving to database. Improved scrapers and rendering. 2019-04-05 01:45:40 +00:00			`async function fetchLatest(site, page = 1) {`
			const res = await bhttp.get(`${site.url}/new-videos/${page}`);
Added tags and duration to scraping. Added LegalPorno scraper. 2019-03-24 00:29:22 +00:00
			`return scrapeLatest(res.body.toString(), site);`
			`}`

			`async function fetchScene(url, site) {`
			`const res = await bhttp.get(url);`

			`return scrapeScene(res.body.toString(), url, site);`
			`}`

			`module.exports = {`
			`fetchLatest,`
			`fetchScene,`
			`};`