Added q scraping helper. Added Perfect Gonzo scraper.

This commit is contained in:
ThePendulum 2020-01-16 21:04:44 +01:00
parent 5dda81535d
commit 61fee5e4f6
23 changed files with 355 additions and 28 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 788 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

View File

@ -113,6 +113,12 @@ const networks = [
url: 'https://www.naughtyamerica.com', url: 'https://www.naughtyamerica.com',
description: 'The best porn movies daily at Naughty America! Experience the most seductive porn stars in stunning virtual reality, 4K and HD porn videos!', description: 'The best porn movies daily at Naughty America! Experience the most seductive porn stars in stunning virtual reality, 4K and HD porn videos!',
}, },
{
slug: 'perfectgonzo',
name: 'Perfect Gonzo',
url: 'https://www.perfectgonzo.com',
description: '',
},
{ {
slug: 'pervcity', slug: 'pervcity',
name: 'Perv City', name: 'Perv City',

View File

@ -2078,6 +2078,67 @@ function getSites(networksMap) {
url: 'https://www.naughtyamerica.com/site/live-naughty-nurse', url: 'https://www.naughtyamerica.com/site/live-naughty-nurse',
network_id: networksMap.naughtyamerica, network_id: networksMap.naughtyamerica,
}, },
// PERFECT GONZO
{
slug: 'allinternal',
name: 'All Internal',
url: 'https://allinternal.com',
network_id: networksMap.perfectgonzo,
},
{
slug: 'asstraffic',
name: 'Ass Traffic',
url: 'https://asstraffic.com',
network_id: networksMap.perfectgonzo,
},
{
slug: 'cumforcover',
name: 'Cum For Cover',
url: 'https://cumforcover.com',
network_id: networksMap.perfectgonzo,
},
{
slug: 'fistflush',
name: 'Fist Flush',
url: 'https://fistflush.com',
network_id: networksMap.perfectgonzo,
},
{
slug: 'givemepink',
name: 'Give Me Pink',
url: 'https://givemepink.com',
network_id: networksMap.perfectgonzo,
},
{
slug: 'milfthing',
name: 'MILF Thing',
url: 'https://milfthing.com',
network_id: networksMap.perfectgonzo,
},
{
slug: 'primecups',
name: 'Prime Cups',
url: 'https://primecups.com',
network_id: networksMap.perfectgonzo,
},
{
slug: 'purepov',
name: 'Pure POV',
url: 'https://purepov.com',
network_id: networksMap.perfectgonzo,
},
{
slug: 'spermswap',
name: 'Sperm Swap',
url: 'https://spermswap.com',
network_id: networksMap.perfectgonzo,
},
{
slug: 'tamedteens',
name: 'Tamed Teens',
url: 'https://tamedteens.com',
network_id: networksMap.perfectgonzo,
},
// PERVCITY // PERVCITY
{ {
slug: 'analoverdose', slug: 'analoverdose',

View File

@ -296,6 +296,11 @@ function getTags(groupsMap) {
slug: 'cum-on-boobs', slug: 'cum-on-boobs',
alias_for: null, alias_for: null,
}, },
{
name: 'cum swapping',
slug: 'cum-swapping',
alias_for: null,
},
{ {
name: 'cumshot', name: 'cumshot',
slug: 'cumshot', slug: 'cumshot',
@ -756,6 +761,11 @@ function getTags(groupsMap) {
alias_for: null, alias_for: null,
group_id: groupsMap.clothing, group_id: groupsMap.clothing,
}, },
{
name: 'solo',
slug: 'solo',
alias_for: null,
},
{ {
name: 'spanking', name: 'spanking',
slug: 'spanking', slug: 'spanking',
@ -1120,6 +1130,10 @@ function getTagAliases(tagsMap) {
name: 'creampies', name: 'creampies',
alias_for: tagsMap.creampie, alias_for: tagsMap.creampie,
}, },
{
name: 'creampie - anal',
alias_for: tagsMap['anal-creampie'],
},
{ {
name: 'crop', // a type of whip, not [sic] short for corporal name: 'crop', // a type of whip, not [sic] short for corporal
alias_for: tagsMap['corporal-punishment'], alias_for: tagsMap['corporal-punishment'],
@ -1188,6 +1202,10 @@ function getTagAliases(tagsMap) {
name: 'doggystyle - standing', name: 'doggystyle - standing',
alias_for: tagsMap['standing-doggy-style'], alias_for: tagsMap['standing-doggy-style'],
}, },
{
name: 'doggystyle regular',
alias_for: tagsMap['doggy-style'],
},
{ {
name: 'dom', name: 'dom',
alias_for: tagsMap.bdsm, alias_for: tagsMap.bdsm,
@ -1536,6 +1554,10 @@ function getTagAliases(tagsMap) {
name: 'teens', name: 'teens',
alias_for: tagsMap.teen, alias_for: tagsMap.teen,
}, },
{
name: 'throat fucking',
alias_for: tagsMap.facefucking,
},
{ {
name: 'tiny boobs', name: 'tiny boobs',
alias_for: tagsMap['small-boobs'], alias_for: tagsMap['small-boobs'],
@ -1598,12 +1620,14 @@ function getSiteTags() {
dpparodies: ['parody'], dpparodies: ['parody'],
eighteenyearsold: ['teen'], eighteenyearsold: ['teen'],
exotic4k: ['4k'], exotic4k: ['4k'],
givemepink: ['solo', 'masturbation'],
lubed: ['oil'], lubed: ['oil'],
familystrokes: ['family'], familystrokes: ['family'],
massagecreep: ['massage'], massagecreep: ['massage'],
menonedge: ['gay'], menonedge: ['gay'],
povd: ['pov'], povd: ['pov'],
puremature: ['milf'], puremature: ['milf'],
spermswap: ['cum-swapping'],
spyfam: ['family'], spyfam: ['family'],
submissived: ['bdsm'], submissived: ['bdsm'],
swallowed: ['blowjob', 'deepthroat', 'facefucking'], swallowed: ['blowjob', 'deepthroat', 'facefucking'],

View File

@ -52,8 +52,10 @@ async function scrapeScene(scene, site, tokens) {
entryId: scene.id, entryId: scene.id,
title: scene.title, title: scene.title,
duration: scene.length, duration: scene.length,
tokens, // attach tokens to reduce number of requests required for deep fetching
site, site,
meta: {
tokens, // attach tokens to reduce number of requests required for deep fetching
},
}; };
release.url = `${site.url}/scene/${release.entryId}/${slugify(release.title, true)}`; release.url = `${site.url}/scene/${release.entryId}/${slugify(release.title, true)}`;
@ -93,7 +95,7 @@ async function fetchLatest(site, page = 1) {
} }
async function fetchScene(url, site, release) { async function fetchScene(url, site, release) {
const { time, token } = release?.tokens || await fetchToken(site); // use attached tokens when deep fetching const { time, token } = release?.meta.tokens || await fetchToken(site); // use attached tokens when deep fetching
const { pathname } = new URL(url); const { pathname } = new URL(url);
const entryId = pathname.split('/')[2]; const entryId = pathname.split('/')[2];

View File

@ -0,0 +1,149 @@
'use strict';
const bhttp = require('bhttp');
const blake2 = require('blake2');
const knex = require('../knex');
const { ex, ctxa } = require('../utils/q');
/**
 * Look up the slugs of every site that belongs to the Perfect Gonzo network.
 *
 * @returns {Promise<string[]>} the `sites.slug` column for all sites joined to
 *   the network whose slug is `perfectgonzo`
 */
async function getSiteSlugs() {
  const slugQuery = knex('sites')
    .pluck('sites.slug')
    .join('networks', 'networks.id', 'sites.network_id')
    .where('networks.slug', 'perfectgonzo');

  // returning the query builder from an async function runs it and resolves with its rows
  return slugQuery;
}
/**
 * Derive a short, deterministic hex identifier from a string.
 *
 * @param {string} identifier - text to hash
 * @returns {string} hex encoding of an 8-byte BLAKE2b digest
 */
function getHash(identifier) {
  const hasher = blake2.createHash('blake2b', { digestLength: 8 });
  const input = Buffer.from(identifier);

  hasher.update(input);

  return hasher.digest('hex');
}
/**
 * Pull the male model names out of a tag container.
 *
 * The container's child nodes are scanned for a node whose trimmed text is
 * exactly `Male Models`; every non-empty node after it, up to the next text
 * node (nodeType 3), is treated as a model name.
 *
 * @param {Node|null|undefined} tagContainer - element holding labels and tag links
 * @returns {string[]} model names, or an empty array when the container or the
 *   `Male Models` label is absent
 */
function extractMaleModelsFromTags(tagContainer) {
  if (!tagContainer) {
    return [];
  }

  // whitespace-only nodes between tags carry no information, drop them up front
  const nodes = Array.from(tagContainer.childNodes, node => ({
    type: node.nodeType,
    text: node.textContent.trim(),
  })).filter(node => node.text.length > 0);

  const modelLabelIndex = nodes.findIndex(node => node.text === 'Male Models');

  if (modelLabelIndex === -1) {
    return [];
  }

  const nextLabelIndex = nodes.findIndex((node, index) => index > modelLabelIndex && node.type === 3);

  // when the models are the last section there is no following label; slicing to -1
  // would silently drop the final model, so run to the end of the list instead
  const endIndex = nextLabelIndex === -1 ? nodes.length : nextLabelIndex;

  return nodes
    .slice(modelLabelIndex + 1, endIndex)
    .map(model => model.text);
}
/**
 * Work out which channel a scene belongs to from one of its photo URLs.
 *
 * @param {string} photo - photo URL to search for a site slug
 * @param {string[]} [metaSiteSlugs] - known site slugs; fetched with
 *   `getSiteSlugs` when not supplied
 * @returns {Promise<string|null>} the first slug found in the URL, or null
 */
async function extractChannelFromPhoto(photo, metaSiteSlugs) {
  const siteSlugs = metaSiteSlugs || await getSiteSlugs();

  // one alternation of all slugs, so a single pass finds whichever appears first
  const channelPattern = new RegExp(siteSlugs.join('|'));
  const found = photo.match(channelPattern);

  return found ? found[0] : null;
}
/**
 * Turn a listing page into an array of releases.
 *
 * @param {string} html - markup of the listing page
 * @param {Object} site - site record; `site.url` and `site.slug` are read
 * @returns {Promise<Object[]>} one release per `#content-main .itemm` element
 */
async function scrapeLatest(html, site) {
  const siteSlugs = await getSiteSlugs();
  const { element: page } = ex(html);

  return ctxa(page, '#content-main .itemm').map((item) => {
    const anchor = item.q('a');
    const { title } = anchor;
    const url = `${site.url}${anchor.href}`;
    const date = item.qdate('.nm-date', 'MM/DD/YYYY');

    // the entry ID is a hash of site slug, URL slug and release date
    const slug = new URL(url).pathname.split('/')[2];
    const entryId = getHash(`${site.slug}${slug}${date.toISOString()}`);

    // the title doubles as the actor list, separated by ampersands
    const actors = title.split('&').map(name => name.trim());
    const [poster, ...photos] = item.qimages('.bloc-link img');
    const tags = item.qa('.dropdown ul a', true).slice(1);
    const duration = item.qlength('.dropdown p:first-child');

    return {
      site,
      meta: {
        siteSlugs, // attached so the scene scraper can reuse them
      },
      title,
      url,
      date,
      entryId,
      actors,
      poster,
      photos,
      tags,
      duration,
    };
  });
}
/**
 * Turn a scene page into a release.
 *
 * @param {string} html - markup of the scene page
 * @param {Object} site - site record, attached to the release as-is
 * @param {string} url - URL the page was fetched from
 * @param {string[]} [metaSiteSlugs] - known site slugs, forwarded to
 *   `extractChannelFromPhoto`
 * @returns {Promise<Object>} the release; `url` and `entryId` are only
 *   (re)written when a channel could be derived from the first photo
 */
async function scrapeScene(html, site, url, metaSiteSlugs) {
  const page = ex(html);
  const release = { url, site };

  release.title = page.q('#movie-header h2', true);
  release.date = page.qdate('#movie-header div span', 'MMMM DD, YYYY', /\w+ \d{1,2}, \d{4}/);
  release.description = page.q('.container .mg-md', true);
  release.duration = page.qlength('#video-ribbon .container > div > span:nth-child(3)');

  // linked actors first, then the male models listed inside the tag container
  const linkedActors = page.qa('#video-info a', true);
  const maleModels = extractMaleModelsFromTags(page.q('.tag-container'));
  release.actors = [...linkedActors, ...maleModels];

  release.tags = page.qa('.tag-container a', true);

  const uhd = page.q('#video-ribbon .container > div > span:nth-child(2)', true);

  if (/4K/.test(uhd)) {
    release.tags = [...release.tags, '4k'];
  }

  release.photos = page.qa('.bxslider_pics img').map(img => img.dataset.original || img.src);
  release.poster = page.qposter();

  const trailer = page.qtrailer();

  if (trailer) {
    release.trailer = { src: trailer };
  }

  if (release.photos.length > 0) {
    release.channel = await extractChannelFromPhoto(release.photos[0], metaSiteSlugs);
  }

  if (release.channel) {
    // rewrite the URL onto the channel's own domain and hash the ID from channel, slug and date
    const { pathname } = new URL(url);
    const slug = pathname.split('/')[2];

    release.url = `https://${release.channel}.com${pathname}`;
    release.entryId = getHash(`${release.channel}${slug}${release.date.toISOString()}`);
  }

  return release;
}
/**
 * Fetch one page of a site's movie listing and scrape its releases.
 *
 * @param {Object} site - site record; `site.url` is the listing's base URL
 * @param {number} [page=1] - listing page number
 * @returns {Promise<Object[]>} scraped releases, or an empty array when the
 *   response status is not 200
 */
async function fetchLatest(site, page = 1) {
  const res = await bhttp.get(`${site.url}/movies/page-${page}`);

  if (res.statusCode !== 200) {
    return [];
  }

  return scrapeLatest(res.body.toString(), site);
}
/**
 * Fetch a scene page and scrape it into a release.
 *
 * @param {string} url - scene page URL
 * @param {Object} site - site record, passed through to `scrapeScene`
 * @param {Object} [release] - previously scraped release; when it carries
 *   `meta.siteSlugs` they are reused for channel detection
 * @returns {Promise<Object|Array>} the scraped release, or an empty array when
 *   the response status is not 200
 */
async function fetchScene(url, site, release) {
  const res = await bhttp.get(url);

  if (res.statusCode === 200) {
    // a release is not guaranteed to carry meta; chain through it so a missing
    // meta yields undefined (scrapeScene then has the slugs looked up) instead of a TypeError
    return scrapeScene(res.body.toString(), site, url, release?.meta?.siteSlugs);
  }

  return [];
}
// Public interface of this scraper: listing fetcher and single-scene fetcher.
module.exports = {
fetchLatest,
fetchScene,
};

View File

@ -3,13 +3,12 @@
const bhttp = require('bhttp'); const bhttp = require('bhttp');
const { JSDOM } = require('jsdom'); const { JSDOM } = require('jsdom');
const moment = require('moment'); const moment = require('moment');
const ex = require('../utils/ex');
function scrapeLatest(html, site) { function scrapeLatest(html, site) {
const s = ex(html); const { document } = new JSDOM(html).window;
const { origin } = new URL(site.url); const { origin } = new URL(site.url);
const videos = s.qa('.video-releases-list').slice(-1)[0]; const videos = document.querySelectorAll('.video-releases-list').slice(-1)[0];
return Array.from(videos.querySelectorAll('.card'), (scene) => { return Array.from(videos.querySelectorAll('.card'), (scene) => {
const release = { site }; const release = { site };

View File

@ -14,6 +14,7 @@ const jayrock = require('./jayrock');
const kink = require('./kink'); const kink = require('./kink');
const mikeadriano = require('./mikeadriano'); const mikeadriano = require('./mikeadriano');
const mofos = require('./mofos'); const mofos = require('./mofos');
const perfectgonzo = require('./perfectgonzo');
const pervcity = require('./pervcity'); const pervcity = require('./pervcity');
const pornpros = require('./pornpros'); const pornpros = require('./pornpros');
const privateNetwork = require('./private'); // reserved keyword const privateNetwork = require('./private'); // reserved keyword
@ -56,6 +57,7 @@ module.exports = {
legalporno, legalporno,
mikeadriano, mikeadriano,
mofos, mofos,
perfectgonzo,
pervcity, pervcity,
pornpros, pornpros,
private: privateNetwork, private: privateNetwork,

View File

@ -1,23 +0,0 @@
'use strict';
const { JSDOM } = require('jsdom');
/**
 * Find the first element matching a selector.
 *
 * @param {Object} context - node exposing `querySelector`
 * @param {string} selector - CSS selector
 * @returns {*} whatever `context.querySelector` returns for the selector
 */
function q(context, selector) {
  const match = context.querySelector(selector);

  return match;
}
/**
 * Find every element matching a selector.
 *
 * @param {Object} context - node exposing `querySelectorAll`
 * @param {string} selector - CSS selector
 * @returns {Array} the matches copied into a plain array
 */
function qa(context, selector) {
  const matches = context.querySelectorAll(selector);

  return Array.from(matches);
}
/**
 * Parse markup and return the document together with query helpers bound to it.
 *
 * @param {string} html - markup to parse with JSDOM
 * @returns {{document: Document, q: Function, qa: Function}} the parsed
 *   document plus `q`/`qa` pre-bound to it
 */
function ex(html) {
  const dom = new JSDOM(html);
  const { document } = dom.window;

  const queryOne = selector => q(document, selector);
  const queryAll = selector => qa(document, selector);

  return {
    document,
    q: queryOne,
    qa: queryAll,
  };
}
// The module's value is the `ex` factory itself.
module.exports = ex;

107
src/utils/q.js Normal file
View File

@ -0,0 +1,107 @@
'use strict';
const { JSDOM } = require('jsdom');
const moment = require('moment');
/**
 * Query a single element, or one property of it.
 *
 * @param {Object} context - node exposing `querySelector`
 * @param {string} selector - CSS selector
 * @param {string|boolean} [attrArg] - property to read from the match; `true`
 *   is shorthand for `textContent`; when omitted the element itself is returned
 * @param {boolean} [trim=true] - trim string values before returning them
 * @returns {*} the element (no `attrArg`), the property value, or null when
 *   nothing matches or the property is null/undefined
 */
function q(context, selector, attrArg, trim = true) {
  const attr = attrArg === true ? 'textContent' : attrArg;
  const el = context.querySelector(selector);

  if (!attr) {
    return el;
  }

  // a selector that matches nothing, or a property the element lacks, yields
  // null rather than a TypeError, in line with how qall already guards with ?.
  const value = el?.[attr];

  if (value == null) {
    return null;
  }

  // only strings can be trimmed; other property values are passed through untouched
  return trim && typeof value === 'string' ? value.trim() : value;
}
/**
 * Query every element matching a selector, or one property of each.
 *
 * @param {Object} context - node exposing `querySelectorAll`
 * @param {string} selector - CSS selector
 * @param {string|boolean} [attrArg] - property to read from each match; `true`
 *   is shorthand for `textContent`; when omitted the elements are returned
 * @param {boolean} [trim=true] - trim each value (missing values stay undefined)
 * @returns {Array} elements, or one property value per element
 */
function qall(context, selector, attrArg, trim = true) {
  const elements = Array.from(context.querySelectorAll(selector));

  if (!attrArg) {
    return elements;
  }

  const attr = attrArg === true ? 'textContent' : attrArg;

  return elements.map((el) => {
    if (trim) {
      return el[attr]?.trim();
    }

    return el[attr];
  });
}
/**
 * Read a date from an element and parse it as UTC.
 *
 * @param {Object} context - node exposing `querySelector`
 * @param {string} selector - CSS selector of the element holding the date
 * @param {string} format - moment format string used for parsing
 * @param {RegExp|string} [match] - when given, only the first match of this
 *   pattern within the text is parsed
 * @param {string} [attr='textContent'] - property holding the date text
 * @returns {Date|null} the parsed date, or null when the element or property
 *   is missing, the pattern does not match, or the text cannot be parsed
 */
function qdate(context, selector, format, match, attr = 'textContent') {
  // tolerate a selector that matches nothing instead of throwing a TypeError
  const dateString = context.querySelector(selector)?.[attr];

  if (typeof dateString !== 'string') {
    return null;
  }

  let dateStamp = dateString.trim();

  if (match) {
    const matched = dateString.match(match);

    if (!matched) {
      return null;
    }

    [dateStamp] = matched;
  }

  const date = moment.utc(dateStamp, format);

  // null is already this function's "no date" signal; do not leak an Invalid Date object
  return date.isValid() ? date.toDate() : null;
}
/**
 * Collect one property from every element matching `selector`.
 * Thin wrapper around `qall` whose defaults read `src` from `img` elements.
 *
 * @param {Object} context - node to search within, passed straight to `qall`
 * @param {string} [selector='img'] - CSS selector
 * @param {string} [attr='src'] - property read from each match
 * @returns {Array} one value per match, as produced by `qall` with its default trimming
 */
function qimages(context, selector = 'img', attr = 'src') {
return qall(context, selector, attr);
}
/**
 * Read a poster image reference.
 * Thin wrapper around `q` whose defaults read `poster` from the first `video` element.
 *
 * @param {Object} context - node to search within, passed straight to `q`
 * @param {string} [selector='video'] - CSS selector
 * @param {string} [attr='poster'] - property read from the match
 * @returns {*} the property value as produced by `q` with its default trimming
 */
function qposter(context, selector = 'video', attr = 'poster') {
return q(context, selector, attr);
}
/**
 * Read a trailer source reference.
 * Thin wrapper around `q` whose defaults read `src` from the first `source` element.
 *
 * @param {Object} context - node to search within, passed straight to `q`
 * @param {string} [selector='source'] - CSS selector
 * @param {string} [attr='src'] - property read from the match
 * @returns {*} the property value as produced by `q` with its default trimming
 */
function qtrailer(context, selector = 'source', attr = 'src') {
return q(context, selector, attr);
}
/**
 * Read a duration such as `12:34` or `1:02:03` from an element and convert it
 * to seconds.
 *
 * @param {Object} context - node exposing `querySelector`
 * @param {string} selector - CSS selector of the element holding the duration
 * @param {string} [attr='textContent'] - property holding the duration text
 * @returns {number|null} duration in seconds, or null when the element or
 *   property is missing or the text contains no `[h:]m:s` stamp
 */
function qlength(context, selector, attr = 'textContent') {
  // look the element up defensively: a selector that matches nothing means
  // "no duration", not a TypeError
  const durationString = context.querySelector(selector)?.[attr];

  if (typeof durationString !== 'string') {
    return null;
  }

  const duration = durationString.match(/(\d+:)?\d+:\d+/);

  if (!duration) {
    return null;
  }

  // the pattern guarantees two or three numeric segments ([h:]m:s), so folding
  // left in base 60 yields the total number of seconds
  return duration[0]
    .split(':')
    .reduce((seconds, segment) => (seconds * 60) + Number(segment), 0);
}
// Table of every query helper, keyed by the name it is exposed under.
// `ctx` binds each entry to an element, and the module exports spread it as-is.
const funcs = {
q,
qall,
qdate,
qimages,
qposter,
qlength,
qtrailer,
// short aliases for the helpers above
qa: qall,
qd: qdate,
qi: qimages,
qp: qposter,
ql: qlength,
qt: qtrailer,
};
/**
 * Build a query context for an element: every helper in `funcs`, pre-bound so
 * the element no longer has to be passed as the first argument.
 *
 * @param {Object} element - node the helpers should search within
 * @returns {Object} `{ element, ...helpers }` where each helper forwards its
 *   arguments to the matching `funcs` entry with `element` prepended
 */
function ctx(element) {
  const bound = { element };

  Object.entries(funcs).forEach(([key, func]) => {
    bound[key] = (...args) => func(element, ...args);
  });

  return bound;
}
/**
 * Build a query context for every element matching a selector.
 *
 * @param {Object} context - node exposing `querySelectorAll`
 * @param {string} selector - CSS selector
 * @returns {Object[]} one `ctx` result per matching element
 */
function ctxa(context, selector) {
  const matches = context.querySelectorAll(selector);

  return Array.from(matches, element => ctx(element));
}
/**
 * Parse markup and return a query context rooted at its document.
 *
 * @param {string} html - markup to parse with JSDOM
 * @returns {Object} the `ctx` result for the parsed document
 */
function ex(html) {
  const dom = new JSDOM(html);

  return ctx(dom.window.document);
}
// Exports the context builders plus every unbound helper (and alias) from `funcs`;
// the unbound helpers take the element to search as their first argument.
module.exports = {
ex,
ctx,
ctxa,
...funcs,
};