Added 21Sextury scraper. Various improvements.

This commit is contained in:
ThePendulum 2019-04-07 20:51:14 +02:00
parent e78e12a3c5
commit ec056a177a
9 changed files with 294 additions and 75 deletions

View File

@ -2,6 +2,22 @@
The latest releases from your favorite porn studios in one place. The latest releases from your favorite porn studios in one place.
## Supported networks & sites ## Supported networks & sites
* **21Sextury**
* Aletta Ocean Empire
* Anal Queen Alysa
* Anal Teen Angels
* Asshole Fever
* Butt Plays
* Cheating Whore Wives
* Club Sandy
* DP Fanatics
* Deepthroat Frenzy
* Footsie Babes
* Gapeland
* Lets Play Lez
* Lez Cuties
* Pix and Video
* Sex with Kathia Nobili
* **Blowpass** * **Blowpass**
* 1000 Facials * 1000 Facials
* Immoral Live * Immoral Live

View File

@ -2,42 +2,19 @@
module.exports = { module.exports = {
include: [ include: [
'blowpass', ['21sextury', [
['brazzers', [ 'analteenangels',
'assesinpublic', 'assholefever',
'babygotboobs', 'clubsandy',
'bigbuttslikeitbig', 'dpfanatics',
'bigtitsatschool', 'deepthroatfrenzy',
'bigtitsatwork', 'footsiebabes',
'bigtitsinsports', 'gapeland',
'bigtitsinuniform', 'lezcuties',
'bigwetbutts', 'pixandvideo',
'brazzersenespanol',
'brazzersexxtra',
'brazzerslive',
'brazzersvault',
'bustyandreal',
'bustyz',
'buttsandblacks',
'cfnm',
'daywithapornstar',
'dirtymasseur',
'doctoradventures',
'hotandmean',
'hotchicksbigasses',
'jugfuckers',
'milfslikeitbig',
'mommygotboobs',
'momsincontrol',
'pornstarslikeitbig',
'racksandblacks',
'realwifestories',
'sexproadventures',
'shesgonnasquirt',
'teenslikeitbig',
'teenslikeitblack',
'zzseries',
]], ]],
'blowpass',
'brazzers',
'julesjordan', 'julesjordan',
['kink', [ ['kink', [
'boundgangbangs', 'boundgangbangs',

View File

@ -4,6 +4,12 @@
exports.seed = knex => Promise.resolve() exports.seed = knex => Promise.resolve()
.then(() => knex('networks').del()) .then(() => knex('networks').del())
.then(() => knex('networks').insert([ .then(() => knex('networks').insert([
{
id: '21sextury',
name: '21Sextury',
url: 'https://www.21sextury.com',
description: 'Watch all the latest scenes and porn video updates on 21Sextury.com, the best European porn site with the hottest pornstars from all over the world! Watch porn videos from the large network here.',
},
{ {
id: 'blowpass', id: 'blowpass',
name: 'Blowpass', name: 'Blowpass',

View File

@ -4,6 +4,87 @@
exports.seed = knex => Promise.resolve() exports.seed = knex => Promise.resolve()
.then(() => knex('sites').del()) .then(() => knex('sites').del())
.then(() => knex('sites').insert([ .then(() => knex('sites').insert([
// 21Sextury
{
id: 'analteenangels',
name: 'Anal Teen Angels',
label: 'atangl',
url: 'https://www.analteenangels.com',
description: 'AnalTeenAngels is presented by the 21Sextury nextwork and features young, European teens in hardcore anal porn. Watch these barely legal teens have their first anal sex and give up their ass for some anal pounding!',
network_id: '21sextury',
},
{
id: 'assholefever',
name: 'Asshole Fever',
label: 'assfev',
url: 'https://www.assholefever.com',
description: 'Welcome to AssholeFever, the most hardcore anal site on the net. Watch your favorite pornstars and anal sluts from all over the world in big booty hardcore porn, anal gape, beads, anal creampie and more! Look inside if you dare!',
network_id: '21sextury',
},
{
id: 'buttplays',
name: 'Butt Plays',
label: 'buttpl',
url: 'https://www.buttplays.com',
network_id: '21sextury',
parameters: JSON.stringify({ filter: true }),
},
{
id: 'clubsandy',
name: 'Club Sandy',
label: 'csandy',
url: 'https://www.clubsandy.com',
network_id: '21sextury',
parameters: JSON.stringify({ filter: true }),
},
{
id: 'deepthroatfrenzy',
name: 'Deepthroat Frenzy',
label: 'dfrenz',
url: 'https://www.deepthroatfrenzy.com',
network_id: '21sextury',
parameters: JSON.stringify({ filter: true }),
},
{
id: 'dpfanatics',
name: 'DP Fanatics',
label: 'dpftic',
url: 'https://www.dpfanatics.com',
description: 'Welcome to DPFanatics, brought to you by 21Sextury. DP Fanatics brings you the best DP sex and double penetration porn you can find. Double vaginal penetration, double anal, amateur and teen DP inside!',
network_id: '21sextury',
},
{
id: 'footsiebabes',
name: 'Footsie Babes',
label: 'footsi',
url: 'https://www.footsiebabes.com',
description: 'Welcome to FootsieBabes.com, bringing you the best foot porn, teen feet and foot worship you can find on the net. Watch stocking porn, footjobs, feet tickling and more inside!',
network_id: '21sextury',
},
{
id: 'gapeland',
name: 'Gapeland',
label: 'gapeln',
url: 'https://www.gapeland.com',
network_id: '21sextury',
parameters: JSON.stringify({ filter: true }),
},
{
id: 'lezcuties',
name: 'Lez Cuties',
label: 'lezcte',
url: 'https://www.lezcuties.com',
description: 'LezCuties brings you the cutest lesbian coeds and tiny teen lesbians in HD lesbian porn. Watch as European teens explore themselves and lick each other\'s tight lesbian pussy while their parents aren\'t home.',
network_id: '21sextury',
},
{
id: 'pixandvideo',
name: 'Pix and Video',
label: 'pixvid',
url: 'https://www.pixandvideo.com',
network_id: '21sextury',
parameters: JSON.stringify({ filter: true }),
},
// BLOWPASS // BLOWPASS
{ {
id: '1000facials', id: '1000facials',

View File

@ -53,10 +53,13 @@ exports.seed = knex => Promise.resolve()
alias_for: null, alias_for: null,
group_id: 'penetration', group_id: 'penetration',
}, },
{
tag: 'anal creampie',
alias_for: null,
},
{ {
tag: 'anal sex', tag: 'anal sex',
alias_for: null, alias_for: null,
group_id: null,
}, },
{ {
tag: 'anal fingering', tag: 'anal fingering',
@ -629,6 +632,10 @@ exports.seed = knex => Promise.resolve()
tag: 'big cocks', tag: 'big cocks',
alias_for: 'big cock', alias_for: 'big cock',
}, },
{
tag: 'big dick',
alias_for: 'big cock',
},
{ {
tag: 'big butts', tag: 'big butts',
alias_for: 'big butt', alias_for: 'big butt',
@ -1049,6 +1056,10 @@ exports.seed = knex => Promise.resolve()
tag: 'tiny tits', tag: 'tiny tits',
alias_for: 'small boobs', alias_for: 'small boobs',
}, },
{
tag: 'tittyfuck',
alias_for: 'titty fuck',
},
{ {
tag: 'trimmed pussy', tag: 'trimmed pussy',
alias_for: 'trimmed', alias_for: 'trimmed',

View File

@ -96,6 +96,10 @@ async function storeReleases(releases) {
async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page = 1) { async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page = 1) {
const latestReleases = await scraper.fetchLatest(site, page); const latestReleases = await scraper.fetchLatest(site, page);
if (latestReleases.length === 0) {
return [];
}
const duplicateReleases = await findDuplicateReleases(latestReleases, site.id); const duplicateReleases = await findDuplicateReleases(latestReleases, site.id);
const duplicateReleasesIds = new Set( const duplicateReleasesIds = new Set(
duplicateReleases duplicateReleases

141
src/scrapers/21sextury.js Normal file
View File

@ -0,0 +1,141 @@
'use strict';
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
/**
 * Turn a scene listing page into an array of release summaries.
 *
 * Every `li[data-itemtype=scene]` element yields one release. When the site
 * carries a truthy `parameters.filter`, scenes whose studio title differs
 * (case-insensitively) from `site.name` are left out.
 *
 * @param {string} html - Markup of the listing page.
 * @param {Object} site - Site being scraped; `url`, `name` and `parameters` are read.
 * @returns {Object[]} Releases with url, entryId, title, actors, date, rating and site.
 */
function scrape(html, site) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });
  const releases = [];

  for (const sceneNode of $('li[data-itemtype=scene]').toArray()) {
    const scene = $(sceneNode);
    const studioName = scene.find('.studioName a').attr('title');
    const mustMatchStudio = site.parameters && site.parameters.filter;

    if (mustMatchStudio && studioName.toLowerCase() !== site.name.toLowerCase()) {
      // scene belongs to another studio shown on the same listing
      continue;
    }

    const titleLink = scene.find('.sceneTitle a');
    const url = `${site.url}${titleLink.attr('href')}`;
    const title = titleLink.attr('title').trim();
    const entryId = scene.attr('data-itemid');

    const dateText = scene.find('.sceneDate').text();
    const date = moment.utc(dateText, 'MM-DD-YYYY').toDate();

    const actors = scene
      .find('.sceneActors a')
      .map((actorIndex, actorElement) => $(actorElement).attr('title'))
      .toArray();

    // the first two `.value` elements are read as the like and dislike counters
    const counters = scene
      .find('.value')
      .toArray()
      .map(value => Number($(value).text()));
    const [likes, dislikes] = counters;

    releases.push({
      url,
      entryId,
      title,
      actors,
      date,
      rating: { likes, dislikes },
      site,
    });
  }

  return releases;
}
/**
 * Parse a single scene page into a release object.
 *
 * Most fields come from the first entry of the page's JSON-LD
 * (`application/ld+json`) script; the like/dislike counters are read from the
 * `#videoWrapper` markup.
 *
 * @param {string} html - Markup of the scene page.
 * @param {string} url - URL the page was fetched from; its last path segment becomes the entry ID.
 * @param {Object} site - Site the scene was requested for. When `site.isFallback` is truthy the
 *   owning site is looked up in the `sites` table by the studio name found on the page.
 * @returns {Promise<Object>} Release with url, entryId, title, date, actors, description,
 *   duration (seconds), tags, rating and site.
 */
async function scrapeScene(html, url, site) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });
  const sceneElement = $('#videoWrapper');

  const json = $('script[type="application/ld+json"]').html();
  const data = JSON.parse(json)[0];

  const entryId = new URL(url).pathname.split('/').slice(-1)[0];

  const title = data.name;
  const date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate();

  // The comparator used to destructure `genderA`/`genderB`, so both values
  // were always undefined and the sort never reordered anything; the `gender`
  // property is read and aliased instead. Sorting happens on a copy so the
  // parsed JSON-LD data is not reordered as a side effect.
  // NOTE(review): the return values are kept as written, which places male
  // performers before female ones - confirm this is the intended order.
  const actors = [...data.actor]
    .sort(({ gender: genderA }, { gender: genderB }) => {
      if (genderA === 'female' && genderB === 'male') return 1;
      if (genderA === 'male' && genderB === 'female') return -1;

      return 0;
    })
    .map(actor => actor.name);

  const description = data.description || null; // prevent empty string

  const likes = Number(sceneElement.find('.rating .state_1 .value').text());
  const dislikes = Number(sceneElement.find('#infoWrapper .rating .state_2 .value').text());

  // The first two characters are dropped (presumably an ISO-8601 "PT" prefix).
  // NOTE(review): moment.duration receives an array here - confirm it parses as intended.
  const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();

  const rawTags = data.keywords.split(', ');

  const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title');
  const siteId = siteName && siteName.replace(/\s+/g, '').toLowerCase();

  const [channelSite, tags] = await Promise.all([
    site.isFallback
      ? knex('sites')
        .where({ id: siteId })
        .orWhereRaw('name = ? collate NOCASE', [siteName])
        .first()
      : site,
    matchTags(rawTags),
  ]);

  // `parameters` is a JSON string on rows read straight from the database, but
  // other functions in this module read it as an object (`site.parameters.filter`).
  // Accept both so an already-parsed site does not make JSON.parse throw.
  const rawParameters = channelSite ? channelSite.parameters : null;
  const parameters = rawParameters && typeof rawParameters === 'string'
    ? JSON.parse(rawParameters)
    : rawParameters;
  const usesGenericSite = Boolean(parameters && parameters.filter);

  // only replace generic URL with site URL if site is not marked to fetch scenes from generic site
  const originalUrl = channelSite && !usesGenericSite
    ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`
    : url;

  return {
    url: originalUrl,
    entryId,
    title,
    date,
    actors,
    description,
    duration,
    tags,
    rating: {
      likes,
      dislikes,
    },
    site: channelSite || site,
  };
}
/**
 * Fetch one page of the latest scenes for a site and parse it with scrape().
 *
 * Sites with a truthy `parameters.filter` are requested from the shared
 * 21sextury.com listing; every other site is requested from its own URL.
 *
 * @param {Object} site - Site to fetch; `url` and optionally `parameters` are read.
 * @param {number} [page=1] - Page number appended to the listing URL.
 * @returns {Promise<Object[]>} Releases parsed from the listing page.
 */
async function fetchLatest(site, page = 1) {
  // Sites without parameters used to crash here on `site.parameters.filter`;
  // guard the access the same way scrape() does.
  const usesNetworkListing = Boolean(site.parameters && site.parameters.filter);
  const baseUrl = usesNetworkListing ? 'https://21sextury.com' : site.url;

  const res = await bhttp.get(`${baseUrl}/en/videos/All-Categories/0/All-Pornstars/0/latest/${page}`);

  return scrape(res.body.toString(), site);
}
/**
 * Fetch the upcoming-scenes listing for a site and parse it with scrape().
 *
 * Sites with a truthy `parameters.filter` are requested from the shared
 * 21sextury.com listing; every other site is requested from its own URL.
 *
 * @param {Object} site - Site to fetch; `url` and optionally `parameters` are read.
 * @returns {Promise<Object[]>} Releases parsed from the upcoming listing.
 */
async function fetchUpcoming(site) {
  // Sites without parameters used to crash here on `site.parameters.filter`;
  // guard the access the same way scrape() does.
  const usesNetworkListing = Boolean(site.parameters && site.parameters.filter);
  const baseUrl = usesNetworkListing ? 'https://21sextury.com' : site.url;

  const res = await bhttp.get(`${baseUrl}/en/videos/All-Categories/0/All-Pornstars/0/upcoming`);

  return scrape(res.body.toString(), site);
}
/**
 * Download a scene page and hand its markup to scrapeScene().
 *
 * @param {string} url - Address of the scene page.
 * @param {Object} site - Site the scene is requested for; passed through unchanged.
 * @returns {Promise<Object>} Whatever scrapeScene() resolves to for the page.
 */
async function fetchScene(url, site) {
  const { body } = await bhttp.get(url);
  const html = body.toString();

  return scrapeScene(html, url, site);
}
// Public scraper interface: the three fetch functions defined above.
// scrape() and scrapeScene() stay module-private.
module.exports = {
fetchLatest,
fetchUpcoming,
fetchScene,
};

View File

@ -9,10 +9,12 @@ const legalporno = require('./legalporno');
const mofos = require('./mofos'); const mofos = require('./mofos');
const pervcity = require('./pervcity'); const pervcity = require('./pervcity');
const privateNetwork = require('./private'); // reserved keyword const privateNetwork = require('./private'); // reserved keyword
const twentyonesextury = require('./21sextury');
const vixen = require('./vixen'); const vixen = require('./vixen');
const xempire = require('./xempire'); const xempire = require('./xempire');
module.exports = { module.exports = {
'21sextury': twentyonesextury,
blowpass, blowpass,
brazzers, brazzers,
ddfnetwork, ddfnetwork,

View File

@ -4,6 +4,7 @@ const bhttp = require('bhttp');
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const moment = require('moment'); const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags'); const { matchTags } = require('../tags');
function scrape(html, site) { function scrape(html, site) {
@ -15,7 +16,7 @@ function scrape(html, site) {
const url = `${site.url}${sceneLinkElement.attr('href')}`; const url = `${site.url}${sceneLinkElement.attr('href')}`;
const title = sceneLinkElement.attr('title'); const title = sceneLinkElement.attr('title');
const shootId = $(element).attr('data-itemid'); const entryId = $(element).attr('data-itemid');
const date = moment const date = moment
.utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY') .utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY')
@ -31,7 +32,7 @@ function scrape(html, site) {
return { return {
url, url,
shootId, entryId,
title, title,
actors, actors,
director: 'Mason', director: 'Mason',
@ -45,44 +46,12 @@ function scrape(html, site) {
}); });
} }
async function scrapeSceneFallback($, url, site) {
const shootId = new URL(url).pathname.split('/').slice(-1)[0];
const title = $('h1.title').text();
const date = moment.utc($('.updatedDate').text(), 'MM-DD-YYYY').toDate();
const actors = $('.sceneColActors a').map((actorIndex, actorElement) => $(actorElement).text()).toArray();
const description = ($('.sceneDesc').text() || '').replace(/Video Description:/g, ' ').trim();
const stars = $('.currentRating').text().split('/')[0] / 2;
const rawTags = $('.sceneColCategories > a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const tags = await matchTags(rawTags);
return {
url,
shootId,
title,
date,
actors,
director: 'Mason',
description,
tags,
rating: {
stars,
},
site,
};
}
async function scrapeScene(html, url, site) { async function scrapeScene(html, url, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true }); const $ = cheerio.load(html, { normalizeWhitespace: true });
const json = $('script[type="application/ld+json"]').html(); const json = $('script[type="application/ld+json"]').html();
if (!json) {
return scrapeSceneFallback($, url, site);
}
const data = JSON.parse(json)[0]; const data = JSON.parse(json)[0];
const shootId = new URL(url).pathname.split('/').slice(-1)[0]; const entryId = new URL(url).pathname.split('/').slice(-1)[0];
const title = data.isPartOf.name; const title = data.isPartOf.name;
const date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate(); const date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate();
@ -102,11 +71,23 @@ async function scrapeScene(html, url, site) {
const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds(); const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
const rawTags = data.keywords.split(', '); const rawTags = data.keywords.split(', ');
const tags = await matchTags(rawTags); const siteDomain = $('meta[name="twitter:domain"]').attr('content');
const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
const siteUrl = siteDomain && `https://www.${siteDomain}`;
const [channelSite, tags] = await Promise.all([
site.isFallback
? knex('sites')
.where({ url: siteUrl })
.orWhere({ id: siteId })
.first()
: site,
matchTags(rawTags),
]);
return { return {
url, url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url,
shootId, entryId,
title, title,
date, date,
actors, actors,
@ -117,7 +98,7 @@ async function scrapeScene(html, url, site) {
rating: { rating: {
stars, stars,
}, },
site, site: channelSite || site,
}; };
} }