From ec056a177a6c311f02d7239d017cd3be955d2fd5 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Sun, 7 Apr 2019 20:51:14 +0200 Subject: [PATCH] Added 21Sextury scraper. Various improvements. --- README.md | 16 +++++ config/default.js | 47 ++++--------- seeds/networks.js | 6 ++ seeds/sites.js | 81 ++++++++++++++++++++++ seeds/tags.js | 13 +++- src/fetch-releases.js | 4 ++ src/scrapers/21sextury.js | 141 ++++++++++++++++++++++++++++++++++++++ src/scrapers/index.js | 2 + src/scrapers/xempire.js | 59 ++++++---------- 9 files changed, 294 insertions(+), 75 deletions(-) create mode 100644 src/scrapers/21sextury.js diff --git a/README.md b/README.md index 52ae032b..9962c4f0 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,22 @@ The latest releases from your favorite porn studios in one place. ## Supported networks & sites +* **21Sextury** + * Aletta Ocean Empire + * Anal Queen Alysa + * Anal Teen Angels + * Asshole Fever + * Butt Plays + * Cheating Whore Wives + * Club Sandy + * DP Fanatics + * Deepthroat Frenzy + * Footsie Babes + * Gapeland + * Lets Play Lez + * Lez Cuties + * Pix and Video + * Sex with Kathia Nobili * **Blowpass** * 1000 Facials * Immoral Live diff --git a/config/default.js b/config/default.js index c0952490..db9bd328 100644 --- a/config/default.js +++ b/config/default.js @@ -2,42 +2,19 @@ module.exports = { include: [ - 'blowpass', - ['brazzers', [ - 'assesinpublic', - 'babygotboobs', - 'bigbuttslikeitbig', - 'bigtitsatschool', - 'bigtitsatwork', - 'bigtitsinsports', - 'bigtitsinuniform', - 'bigwetbutts', - 'brazzersenespanol', - 'brazzersexxtra', - 'brazzerslive', - 'brazzersvault', - 'bustyandreal', - 'bustyz', - 'buttsandblacks', - 'cfnm', - 'daywithapornstar', - 'dirtymasseur', - 'doctoradventures', - 'hotandmean', - 'hotchicksbigasses', - 'jugfuckers', - 'milfslikeitbig', - 'mommygotboobs', - 'momsincontrol', - 'pornstarslikeitbig', - 'racksandblacks', - 'realwifestories', - 'sexproadventures', - 'shesgonnasquirt', - 'teenslikeitbig', - 'teenslikeitblack', - 'zzseries', + ['21sextury', [ + 'analteenangels', + 'assholefever', + 'clubsandy', + 'dpfanatics', + 'deepthroatfrenzy', + 'footsiebabes', + 'gapeland', + 'lezcuties', + 'pixandvideo', ]], + 'blowpass', + 'brazzers', 'julesjordan', ['kink', [ 'boundgangbangs', diff --git a/seeds/networks.js b/seeds/networks.js index aad451aa..3ca3750b 100644 --- a/seeds/networks.js +++ b/seeds/networks.js @@ -4,6 +4,12 @@ exports.seed = knex => Promise.resolve() .then(() => knex('networks').del()) .then(() => knex('networks').insert([ + { + id: '21sextury', + name: '21Sextury', + url: 'https://www.21sextury.com', + description: 'Watch all the latest scenes and porn video updates on 21Sextury.com, the best European porn site with the hottest pornstars from all over the world! Watch porn videos from the large network here.', + }, { id: 'blowpass', name: 'Blowpass', diff --git a/seeds/sites.js b/seeds/sites.js index 48edfd42..4ea33e9a 100644 --- a/seeds/sites.js +++ b/seeds/sites.js @@ -4,6 +4,87 @@ exports.seed = knex => Promise.resolve() .then(() => knex('sites').del()) .then(() => knex('sites').insert([ + // 21Sextury + { + id: 'analteenangels', + name: 'Anal Teen Angels', + label: 'atangl', + url: 'https://www.analteenangels.com', + description: 'AnalTeenAngels is presented by the 21Sextury nextwork and features young, European teens in hardcore anal porn. Watch these barely legal teens have their first anal sex and give up their ass for some anal pounding!', + network_id: '21sextury', + }, + { + id: 'assholefever', + name: 'Asshole Fever', + label: 'assfev', + url: 'https://www.assholefever.com', + description: 'Welcome to AssholeFever, the most hardcore anal site on the net. Watch your favorite pornstars and anal sluts from all over the world in big booty hardcore porn, anal gape, beads, anal creampie and more! Look inside if you dare!', + network_id: '21sextury', + }, + { + id: 'buttplays', + name: 'Butt Plays', + label: 'buttpl', + url: 'https://www.buttplays.com', + network_id: '21sextury', + parameters: JSON.stringify({ filter: true }), + }, + { + id: 'clubsandy', + name: 'Club Sandy', + label: 'csandy', + url: 'https://www.clubsandy.com', + network_id: '21sextury', + parameters: JSON.stringify({ filter: true }), + }, + { + id: 'deepthroatfrenzy', + name: 'Deepthroat Frenzy', + label: 'dfrenz', + url: 'https://www.deepthroatfrenzy.com', + network_id: '21sextury', + parameters: JSON.stringify({ filter: true }), + }, + { + id: 'dpfanatics', + name: 'DP Fanatics', + label: 'dpftic', + url: 'https://www.dpfanatics.com', + description: 'Welcome to DPFanatics, brought to you by 21Sextury. DP Fanatics brings you the best DP sex and double penetration porn you can find. Double vaginal penetration, double anal, amateur and teen DP inside!', + network_id: '21sextury', + }, + { + id: 'footsiebabes', + name: 'Footsie Babes', + label: 'footsi', + url: 'https://www.footsiebabes.com', + description: 'Welcome to FootsieBabes.com, bringing you the best foot porn, teen feet and foot worship you can find on the net. Watch stocking porn, footjobs, feet tickling and more inside!', + network_id: '21sextury', + }, + { + id: 'gapeland', + name: 'Gapeland', + label: 'gapeln', + url: 'https://www.gapeland.com', + network_id: '21sextury', + parameters: JSON.stringify({ filter: true }), + }, + { + id: 'lezcuties', + name: 'Lez Cuties', + label: 'lezcte', + url: 'https://www.lezcuties.com', + description: 'LezCuties brings you the cutest lesbian coeds and tiny teen lesbians in HD lesbian porn. Watch as European teens explore themselves and lick each other\'s tight lesbian pussy while their parents aren\'t home.', + network_id: '21sextury', + }, + { + id: 'pixandvideo', + name: 'Pix and Video', + label: 'pixvid', + url: 'https://www.pixandvideo.com', + network_id: '21sextury', + parameters: JSON.stringify({ filter: true }), + }, // BLOWPASS { id: '1000facials', diff --git a/seeds/tags.js b/seeds/tags.js index f076c39c..6fe50717 100644 --- a/seeds/tags.js +++ b/seeds/tags.js @@ -53,10 +53,13 @@ exports.seed = knex => Promise.resolve() alias_for: null, group_id: 'penetration', }, + { + tag: 'anal creampie', + alias_for: null, + }, { tag: 'anal sex', alias_for: null, - group_id: null, }, { tag: 'anal fingering', @@ -629,6 +632,10 @@ exports.seed = knex => Promise.resolve() tag: 'big cocks', alias_for: 'big cock', }, + { + tag: 'big dick', + alias_for: 'big cock', + }, { tag: 'big butts', alias_for: 'big butt', @@ -1049,6 +1056,10 @@ exports.seed = knex => Promise.resolve() tag: 'tiny tits', alias_for: 'small boobs', }, + { + tag: 'tittyfuck', + alias_for: 'titty fuck', + }, { tag: 'trimmed pussy', alias_for: 'trimmed', diff --git a/src/fetch-releases.js b/src/fetch-releases.js index 1495dbe9..80e44d83 100644 --- a/src/fetch-releases.js +++ b/src/fetch-releases.js @@ -96,6 +96,10 @@ async function storeReleases(releases) { async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page = 1) { const latestReleases = await scraper.fetchLatest(site, page); + if (latestReleases.length === 0) { + return []; + } + const duplicateReleases = await findDuplicateReleases(latestReleases, site.id); const duplicateReleasesIds = new Set( duplicateReleases diff --git a/src/scrapers/21sextury.js b/src/scrapers/21sextury.js new file mode 100644 index 00000000..be6998a7 --- /dev/null +++ b/src/scrapers/21sextury.js @@ -0,0 +1,141 @@ +'use strict'; + +const bhttp = require('bhttp'); +const cheerio = require('cheerio'); +const moment = require('moment'); + +const knex = require('../knex'); +const { matchTags } = require('../tags'); + +function scrape(html, site) { + const $ = cheerio.load(html, { normalizeWhitespace: true }); + const scenesElements = $('li[data-itemtype=scene]').toArray(); + + return scenesElements.reduce((accReleases, element) => { + const siteName = $(element).find('.studioName a').attr('title'); + + if (site.parameters && site.parameters.filter && siteName.toLowerCase() !== site.name.toLowerCase()) { + return accReleases; + } + + const sceneLinkElement = $(element).find('.sceneTitle a'); + const url = `${site.url}${sceneLinkElement.attr('href')}`; + const title = sceneLinkElement.attr('title').trim(); + + const entryId = $(element).attr('data-itemid'); + + const date = moment + .utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY') + .toDate(); + + const actors = $(element).find('.sceneActors a') + .map((actorIndex, actorElement) => $(actorElement).attr('title')) + .toArray(); + + const [likes, dislikes] = $(element).find('.value') + .toArray() + .map(value => Number($(value).text())); + + return [ + ...accReleases, + { + url, + entryId, + title, + actors, + date, + rating: { + likes, + dislikes, + }, + site, + }, + ]; + }, []); +} + +async function scrapeScene(html, url, site) { + const $ = cheerio.load(html, { normalizeWhitespace: true }); + const sceneElement = $('#videoWrapper'); + const json = $('script[type="application/ld+json"]').html(); + + const data = JSON.parse(json)[0]; + const entryId = new URL(url).pathname.split('/').slice(-1)[0]; + + const title = data.name; + const date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate(); + + const actors = data.actor + .sort(({ genderA }, { genderB }) => { + if (genderA === 'female' && genderB === 'male') return 1; + if (genderA === 'male' && genderB === 'female') return -1; + + return 0; + }) + .map(actor => actor.name); + + const description = data.description || null; // prevent empty string + const likes = Number(sceneElement.find('.rating .state_1 .value').text()); + const dislikes = Number(sceneElement.find('#infoWrapper .rating .state_2 .value').text()); + + const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds(); + + const rawTags = data.keywords.split(', '); + const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title'); + const siteId = siteName && siteName.replace(/\s+/g, '').toLowerCase(); + + const [channelSite, tags] = await Promise.all([ + site.isFallback + ? knex('sites') + .where({ id: siteId }) + .orWhereRaw('name = ? collate NOCASE', [siteName]) + .first() + : site, + matchTags(rawTags), + ]); + + // only replace generic URL with site URL if site is not marked to fetch scenes from generic site + const originalUrl = channelSite && !(channelSite.parameters && JSON.parse(channelSite.parameters).filter) + ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` + : url; + + return { + url: originalUrl, + entryId, + title, + date, + actors, + description, + duration, + tags, + rating: { + likes, + dislikes, + }, + site: channelSite || site, + }; +} + +async function fetchLatest(site, page = 1) { + const res = await bhttp.get(`${site.parameters.filter ? 'https://21sextury.com' : site.url}/en/videos/All-Categories/0/All-Pornstars/0/latest/${page}`); + + return scrape(res.body.toString(), site); +} + +async function fetchUpcoming(site) { + const res = await bhttp.get(`${site.parameters.filter ? 'https://21sextury.com' : site.url}/en/videos/All-Categories/0/All-Pornstars/0/upcoming`); + + return scrape(res.body.toString(), site); +} + +async function fetchScene(url, site) { + const res = await bhttp.get(url); + + return scrapeScene(res.body.toString(), url, site); +} + +module.exports = { + fetchLatest, + fetchUpcoming, + fetchScene, +}; diff --git a/src/scrapers/index.js b/src/scrapers/index.js index 1ffb420f..d1578a26 100644 --- a/src/scrapers/index.js +++ b/src/scrapers/index.js @@ -9,10 +9,12 @@ const legalporno = require('./legalporno'); const mofos = require('./mofos'); const pervcity = require('./pervcity'); const privateNetwork = require('./private'); // reserved keyword +const twentyonesextury = require('./21sextury'); const vixen = require('./vixen'); const xempire = require('./xempire'); module.exports = { + '21sextury': twentyonesextury, blowpass, brazzers, ddfnetwork, diff --git a/src/scrapers/xempire.js b/src/scrapers/xempire.js index ab7108e3..839546c7 100644 --- a/src/scrapers/xempire.js +++ b/src/scrapers/xempire.js @@ -4,6 +4,7 @@ const bhttp = require('bhttp'); const cheerio = require('cheerio'); const moment = require('moment'); +const knex = require('../knex'); const { matchTags } = require('../tags'); function scrape(html, site) { @@ -15,7 +16,7 @@ function scrape(html, site) { const url = `${site.url}${sceneLinkElement.attr('href')}`; const title = sceneLinkElement.attr('title'); - const shootId = $(element).attr('data-itemid'); + const entryId = $(element).attr('data-itemid'); const date = moment .utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY') @@ -31,7 +32,7 @@ function scrape(html, site) { return { url, - shootId, + entryId, title, actors, director: 'Mason', @@ -45,44 +46,12 @@ function scrape(html, site) { }); } -async function scrapeSceneFallback($, url, site) { - const shootId = new URL(url).pathname.split('/').slice(-1)[0]; - const title = $('h1.title').text(); - const date = moment.utc($('.updatedDate').text(), 'MM-DD-YYYY').toDate(); - const actors = $('.sceneColActors a').map((actorIndex, actorElement) => $(actorElement).text()).toArray(); - - const description = ($('.sceneDesc').text() || '').replace(/Video Description:/g, ' ').trim(); - const stars = $('.currentRating').text().split('/')[0] / 2; - - const rawTags = $('.sceneColCategories > a').map((tagIndex, tagElement) => $(tagElement).text()).toArray(); - const tags = await matchTags(rawTags); - - return { - url, - shootId, - title, - date, - actors, - director: 'Mason', - description, - tags, - rating: { - stars, - }, - site, - }; -} - async function scrapeScene(html, url, site) { const $ = cheerio.load(html, { normalizeWhitespace: true }); const json = $('script[type="application/ld+json"]').html(); - if (!json) { - return scrapeSceneFallback($, url, site); - } - const data = JSON.parse(json)[0]; - const shootId = new URL(url).pathname.split('/').slice(-1)[0]; + const entryId = new URL(url).pathname.split('/').slice(-1)[0]; const title = data.isPartOf.name; const date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate(); @@ -102,11 +71,23 @@ async function scrapeScene(html, url, site) { const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds(); const rawTags = data.keywords.split(', '); - const tags = await matchTags(rawTags); + const siteDomain = $('meta[name="twitter:domain"]').attr('content'); + const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase(); + const siteUrl = siteDomain && `https://www.${siteDomain}`; + + const [channelSite, tags] = await Promise.all([ + site.isFallback + ? knex('sites') + .where({ url: siteUrl }) + .orWhere({ id: siteId }) + .first() + : site, + matchTags(rawTags), + ]); return { - url, - shootId, + url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url, + entryId, title, date, actors, @@ -117,7 +98,7 @@ async function scrapeScene(html, url, site) { rating: { stars, }, - site, + site: channelSite || site, }; }