Refactored New Sensations scraper.
|
@ -78,7 +78,7 @@
|
|||
"tunnel": "0.0.6",
|
||||
"ua-parser-js": "^1.0.32",
|
||||
"undici": "^4.13.0",
|
||||
"unprint": "^0.8.2",
|
||||
"unprint": "^0.9.1",
|
||||
"url-pattern": "^1.0.3",
|
||||
"v-tooltip": "^2.0.3",
|
||||
"video.js": "^7.11.4",
|
||||
|
@ -17004,9 +17004,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/unprint": {
|
||||
"version": "0.8.2",
|
||||
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.8.2.tgz",
|
||||
"integrity": "sha512-mCKPDPwtuECbXJJLQbDn2FFbydr6fLKytyS3pymbxcTh2dkk7NFypMjR7qjU2Uv9Fl91hSE48SjYMsWHNKpp4w==",
|
||||
"version": "0.9.1",
|
||||
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.9.1.tgz",
|
||||
"integrity": "sha512-TEdPfly6qOST3Vessa9gNFFkJsSsdWe04k0FCU5XuvdpCSb8eVRrAvLouuyJI/GkIhrn6ZHc9VgX/gRW/R5UcQ==",
|
||||
"dependencies": {
|
||||
"axios": "^0.27.2",
|
||||
"bottleneck": "^2.19.5",
|
||||
|
@ -31449,9 +31449,9 @@
|
|||
"integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw="
|
||||
},
|
||||
"unprint": {
|
||||
"version": "0.8.2",
|
||||
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.8.2.tgz",
|
||||
"integrity": "sha512-mCKPDPwtuECbXJJLQbDn2FFbydr6fLKytyS3pymbxcTh2dkk7NFypMjR7qjU2Uv9Fl91hSE48SjYMsWHNKpp4w==",
|
||||
"version": "0.9.1",
|
||||
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.9.1.tgz",
|
||||
"integrity": "sha512-TEdPfly6qOST3Vessa9gNFFkJsSsdWe04k0FCU5XuvdpCSb8eVRrAvLouuyJI/GkIhrn6ZHc9VgX/gRW/R5UcQ==",
|
||||
"requires": {
|
||||
"axios": "^0.27.2",
|
||||
"bottleneck": "^2.19.5",
|
||||
|
|
|
@ -137,7 +137,7 @@
|
|||
"tunnel": "0.0.6",
|
||||
"ua-parser-js": "^1.0.32",
|
||||
"undici": "^4.13.0",
|
||||
"unprint": "^0.8.2",
|
||||
"unprint": "^0.9.1",
|
||||
"url-pattern": "^1.0.3",
|
||||
"v-tooltip": "^2.0.3",
|
||||
"video.js": "^7.11.4",
|
||||
|
|
Before Width: | Height: | Size: 2.6 KiB After Width: | Height: | Size: 2.6 KiB |
Before Width: | Height: | Size: 2.6 KiB After Width: | Height: | Size: 2.7 KiB |
After Width: | Height: | Size: 7.4 KiB |
After Width: | Height: | Size: 2.7 KiB |
Before Width: | Height: | Size: 979 B After Width: | Height: | Size: 1015 B |
Before Width: | Height: | Size: 2.3 KiB After Width: | Height: | Size: 2.3 KiB |
Before Width: | Height: | Size: 1.0 KiB After Width: | Height: | Size: 1.1 KiB |
Before Width: | Height: | Size: 2.7 KiB After Width: | Height: | Size: 2.7 KiB |
Before Width: | Height: | Size: 933 B After Width: | Height: | Size: 969 B |
Before Width: | Height: | Size: 690 B After Width: | Height: | Size: 726 B |
Before Width: | Height: | Size: 2.4 KiB After Width: | Height: | Size: 2.5 KiB |
Before Width: | Height: | Size: 1.6 KiB After Width: | Height: | Size: 1.7 KiB |
Before Width: | Height: | Size: 2.3 KiB After Width: | Height: | Size: 2.3 KiB |
Before Width: | Height: | Size: 1.7 KiB After Width: | Height: | Size: 1.8 KiB |
After Width: | Height: | Size: 2.6 KiB |
Before Width: | Height: | Size: 1.0 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 8.3 KiB After Width: | Height: | Size: 9.9 KiB |
Before Width: | Height: | Size: 690 B After Width: | Height: | Size: 726 B |
Before Width: | Height: | Size: 983 B After Width: | Height: | Size: 1019 B |
After Width: | Height: | Size: 17 KiB |
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
Before Width: | Height: | Size: 2.6 KiB After Width: | Height: | Size: 2.7 KiB |
After Width: | Height: | Size: 7.4 KiB |
After Width: | Height: | Size: 2.7 KiB |
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
Before Width: | Height: | Size: 43 KiB After Width: | Height: | Size: 43 KiB |
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 7.9 KiB After Width: | Height: | Size: 7.9 KiB |
Before Width: | Height: | Size: 17 KiB After Width: | Height: | Size: 17 KiB |
Before Width: | Height: | Size: 7.2 KiB After Width: | Height: | Size: 7.3 KiB |
Before Width: | Height: | Size: 26 KiB After Width: | Height: | Size: 26 KiB |
Before Width: | Height: | Size: 9.8 KiB After Width: | Height: | Size: 9.9 KiB |
Before Width: | Height: | Size: 10 KiB After Width: | Height: | Size: 11 KiB |
BIN
public/img/logos/newsensations/thumbs/shanedieselsbangingbabes.png
Executable file → Normal file
Before Width: | Height: | Size: 13 KiB After Width: | Height: | Size: 13 KiB |
After Width: | Height: | Size: 18 KiB |
Before Width: | Height: | Size: 20 KiB After Width: | Height: | Size: 20 KiB |
Before Width: | Height: | Size: 4.7 KiB After Width: | Height: | Size: 4.8 KiB |
Before Width: | Height: | Size: 6.5 KiB After Width: | Height: | Size: 6.5 KiB |
Before Width: | Height: | Size: 27 KiB After Width: | Height: | Size: 29 KiB |
Before Width: | Height: | Size: 7.2 KiB After Width: | Height: | Size: 7.3 KiB |
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 15 KiB |
|
@ -6856,16 +6856,9 @@ const sites = [
|
|||
parent: 'newsensations',
|
||||
parameters: {
|
||||
siteId: 'hwxxx',
|
||||
block: true,
|
||||
layout: 'block',
|
||||
},
|
||||
},
|
||||
{
|
||||
slug: 'tabutales',
|
||||
name: 'Tabu Tales',
|
||||
url: 'https://www.thetabutales.com',
|
||||
parent: 'newsensations',
|
||||
parameters: { siteId: 'tt' },
|
||||
},
|
||||
{
|
||||
slug: 'nsfamilyxxx',
|
||||
name: 'Family XXX',
|
||||
|
@ -6874,7 +6867,7 @@ const sites = [
|
|||
tags: ['family'],
|
||||
parameters: {
|
||||
siteId: 'famxxx',
|
||||
block: true,
|
||||
layout: 'block',
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -6883,7 +6876,10 @@ const sites = [
|
|||
url: 'https://www.thelesbianexperience.com',
|
||||
parent: 'newsensations',
|
||||
tags: ['lesbian'],
|
||||
parameters: { siteId: 'tle' },
|
||||
parameters: {
|
||||
siteId: 'tle',
|
||||
layout: 'block',
|
||||
},
|
||||
},
|
||||
{
|
||||
slug: 'theromanceseries',
|
||||
|
@ -6892,13 +6888,6 @@ const sites = [
|
|||
parent: 'newsensations',
|
||||
parameters: { siteId: 'rs' },
|
||||
},
|
||||
{
|
||||
slug: 'talesfromtheedge',
|
||||
name: 'Tales From The Edge',
|
||||
url: 'thetalesfromtheedge',
|
||||
parent: 'newsensations',
|
||||
parameters: { siteId: 'ttfte' },
|
||||
},
|
||||
{
|
||||
slug: 'parodypass',
|
||||
name: 'Parody Pass',
|
||||
|
@ -6907,11 +6896,15 @@ const sites = [
|
|||
parameters: { siteId: 'pp' },
|
||||
},
|
||||
{
|
||||
slug: 'shanedieselsbangingbabes',
|
||||
name: 'Shane Diesel\'s Banging Babes',
|
||||
url: 'http://shanedieselsbangingbabes.com',
|
||||
slug: 'shanedieselxxx',
|
||||
name: 'Shane Diesel XXX',
|
||||
alias: ['shane diesel\'s banging babes', 'sdbb'],
|
||||
url: 'https://shanedieselxxx.com',
|
||||
parent: 'newsensations',
|
||||
parameters: { siteId: 'sdbb' },
|
||||
parameters: {
|
||||
siteId: 'sdxxx',
|
||||
layout: 'block',
|
||||
},
|
||||
},
|
||||
{
|
||||
slug: 'unlimitedmilfs',
|
||||
|
@ -6961,7 +6954,25 @@ const sites = [
|
|||
name: 'Fresh Outta High School',
|
||||
url: 'https://www.freshouttahighschool.com',
|
||||
parent: 'newsensations',
|
||||
parameters: { siteId: 'fohs' },
|
||||
parameters: {
|
||||
siteId: 'fohs',
|
||||
layout: 'block',
|
||||
},
|
||||
},
|
||||
// merged with Family XXX or main site
|
||||
{
|
||||
slug: 'tabutales',
|
||||
name: 'Tabu Tales',
|
||||
url: 'https://www.thetabutales.com',
|
||||
parent: 'newsensations',
|
||||
parameters: { siteId: 'tt' },
|
||||
},
|
||||
{
|
||||
slug: 'talesfromtheedge',
|
||||
name: 'Tales From The Edge',
|
||||
url: 'https://www.thetalesfromtheedge.com',
|
||||
parent: 'newsensations',
|
||||
parameters: { siteId: 'ttfte' },
|
||||
},
|
||||
// NUBILES
|
||||
{
|
||||
|
|
|
@ -113,9 +113,6 @@ function fetchMovie(scraper, url, entity, baseRelease, options) {
|
|||
async function scrapeRelease(baseRelease, entitiesByHostname, type = 'scene') {
|
||||
const entity = baseRelease.entity || entitiesByHostname[urlToHostname(baseRelease.url)];
|
||||
|
||||
console.log(entitiesByHostname);
|
||||
console.log(entity);
|
||||
|
||||
if (!entity) {
|
||||
logger.warn(`No entity available for ${baseRelease.url}`);
|
||||
return baseRelease;
|
||||
|
|
|
@ -103,7 +103,7 @@ function urlToHostname(url) {
|
|||
try {
|
||||
const hostname = new URL(url)
|
||||
.hostname
|
||||
.match(/(www\.)(.*)/)?.at(-1);
|
||||
.match(/(www\.)?(.*)/)?.at(-1);
|
||||
|
||||
return hostname;
|
||||
} catch (error) {
|
||||
|
|
|
@ -1,90 +1,165 @@
|
|||
'use strict';
|
||||
|
||||
const { geta, ed } = require('../utils/q');
|
||||
const unprint = require('unprint');
|
||||
const slugify = require('../utils/slugify');
|
||||
|
||||
/**
 * Scrapes scene summaries from a listing page that uses the "block" layout.
 *
 * @param {Array<{ html: string, query: object }>} scenes - One context per listing item,
 *   each exposing the item's raw HTML and a query helper scoped to it
 *   (presumably the per-element contexts unprint returns for `selectAll` — confirm against caller).
 * @returns {object[]} One release per item with title, url, date, actors, entryId, poster and teaser.
 */
function scrapeLatestBlock(scenes) {
	return scenes.map(({ html, query }) => {
		const release = {};

		release.title = query.content('h4 a');
		release.url = query.url('h4 a');
		// the date has no dedicated element here, so it is pattern-matched out of the item's raw HTML
		release.date = unprint.extractDate(html, 'MM/DD/YYYY', { match: /\d{2}\/\d{2}\/\d{4}/ });

		release.actors = query.contents('.tour_update_models a');

		// native videothumb entry ID does not occur on scene page, date is not available on all sites
		// sort a copy: the ID must not depend on actor order, but release.actors keeps its page order
		const sortedActors = [...release.actors].sort();
		release.entryId = slugify([...sortedActors, release.title]);

		release.poster = query.dataset('.video_placeholder', 'src');
		release.teaser = query.video();

		return release;
	});
}
|
||||
|
||||
/**
 * Scrapes scene summaries from a listing page that uses the "classic" layout.
 *
 * @param {Array<{ query: object }>} scenes - One context per listing item, each exposing a
 *   query helper scoped to that item (presumably unprint `selectAll` contexts — confirm against caller).
 * @returns {object[]} One release per item with url, title, description, date, duration (seconds),
 *   actors, entryId, poster and photos.
 */
function scrapeLatestClassic(scenes) {
	return scenes.map(({ query }) => {
		const release = {};

		release.url = query.url('a');

		release.title = query.content('.update_title_small') || query.content('a:nth-child(2)');
		release.description = query.attribute('a', 'title');

		release.date = query.date('.date_small, .update_date', 'MM/DD/YYYY');

		// isolate the figure preceding "min" (same pattern scrapeSceneBlock uses) so that any other
		// number in the counts line is not mistaken for the running time; minutes are stored as seconds
		release.duration = query.number('.update_counts', { match: /(\d+)\s+min/i, matchIndex: 1 }) * 60;

		const linkedActors = query.contents('.update_models a');

		// fall back to the plain comma-separated text when names are not links; tolerate a missing element
		release.actors = linkedActors.length > 0
			? linkedActors
			: (query.content('.update_models')?.split(/,\s*/) ?? []);

		// native videothumb entry ID does not occur on scene page, date is not available on all sites
		// sort a copy: the ID must not depend on actor order, but release.actors keeps its page order
		const sortedActors = [...release.actors].sort();
		release.entryId = slugify([...sortedActors, release.title]);

		const photoCount = query.number('.update_thumb', { attribute: 'cnt' });

		// the thumbnail carries one src<index>_<density>x attribute set per photo; prefer the densest
		// available, use the first photo as poster and the rest as photos
		[release.poster, ...release.photos] = Array.from({ length: photoCount }, (value, index) => query.attribute('.update_thumb', `src${index}_3x`)
			|| query.attribute('.update_thumb', `src${index}_2x`)
			|| query.attribute('.update_thumb', `src${index}_1x`));

		return release;
	});
}
|
||||
|
||||
/**
 * Scrapes a single scene page that uses the "block" layout.
 *
 * @param {{ query: object }} context - Page context exposing a query helper for the scene document.
 * @returns {object} Release with title, description, date, actors ({ name, url }), duration (seconds),
 *   entryId, stars, poster and trailer.
 */
function scrapeSceneBlock({ query }) {
	const release = {};

	release.title = query.content('.indScene h2');
	release.description = query.content('.description');
	release.date = query.date('.sceneDateP span', 'MM/DD/YYYY');

	release.actors = query.all('.sceneTextLink .tour_update_models a').map((actorEl) => ({
		name: unprint.query.content(actorEl),
		url: unprint.query.url(actorEl, null),
	}));

	// the running time shares the date paragraph, so pick the number preceding "min"; stored as seconds
	release.duration = query.number('.sceneDateP', { match: /(\d+)\s+min/i, matchIndex: 1 }) * 60;

	// the ID is built from the sorted actor names plus the title; map() already yields a fresh array,
	// so sorting does not disturb release.actors
	release.entryId = slugify([...release.actors.map((actor) => actor.name).sort(), release.title]);

	release.stars = query.number('.sceneRating');

	release.poster = query.img('#trailer_thumb img[src*=content]');
	release.trailer = query.video('#trailerVideo source');

	return release;
}
|
||||
|
||||
/**
 * Loads a gallery page and collects the images in its grid.
 *
 * @param {string} url - Address of the gallery page.
 * @returns {Promise<Array>} Whatever `query.imgs` yields for `.grid-gallery img`,
 *   or an empty array when the response is not ok.
 */
async function fetchPhotos(url) {
	const res = await unprint.get(url);

	return res.ok
		? res.context.query.imgs('.grid-gallery img')
		: [];
}
|
||||
|
||||
/**
 * Scrapes a single scene page that uses the "classic" layout.
 *
 * @param {{ query: object }} pageContext - Page context exposing a query helper for the scene document.
 * @param {object} context - Not read by this function.
 * @param {object} [options] - Scrape options; only `includePhotos` is read here.
 * @returns {Promise<object>} Release with title, description, date, actors ({ name, url }), entryId,
 *   tags, stars, poster, trailer and, when requested and a gallery link exists, photos.
 */
async function scrapeSceneClassic({ query }, context, options) {
	const release = {};

	release.title = query.content('.update_title');
	release.description = query.content('.update_description');
	release.date = query.date('.update_date', 'MM/DD/YYYY');

	release.actors = query.all('.update_models a').map((actorEl) => ({
		name: unprint.query.content(actorEl),
		url: unprint.query.url(actorEl, null),
	}));

	// the ID is built from the sorted actor names plus the title; map() already yields a fresh array,
	// so sorting does not disturb release.actors
	release.entryId = slugify([...release.actors.map((actor) => actor.name).sort(), release.title]);

	release.tags = query.contents('.update_tags a');
	release.stars = query.number('.gallery_info', { match: /average\s+rating:\s+(\d+\.\d+)/i, matchIndex: 1 });

	release.poster = query.img('#vidplayer', { attribute: 'poster' });
	release.trailer = query.video('#vidplayer source');

	// the gallery is reached through the link that wraps the gallery.gif button image
	const gallery = query.url('//a[img[contains(@src, "gallery.gif")]]');

	// photos cost an extra request, so only fetch them when asked to; options may be omitted
	if (gallery && options?.includePhotos) {
		release.photos = await fetchPhotos(gallery);
	}

	return release;
}
|
||||
|
||||
/**
 * Requests one page of a site's movie listing and returns the raw unprint response.
 * Shared by the classic and block layout fetchers, which decide how to scrape the result.
 *
 * @param {{ url: string, parameters: { siteId: string } }} site - Site whose tour listing is requested.
 * @param {number} [page=1] - Listing page number, inserted into the listing path.
 * @returns {Promise<object>} The result of `unprint.get`, selecting every listing item of either layout.
 */
async function fetchLatest(site, page = 1) {
	const url = `${site.url}/tour_${site.parameters.siteId}/categories/movies_${page}_d.html`;

	// one selector list covers both layouts; each selector only needs to be listed once
	const res = await unprint.get(url, { selectAll: '.movieBlock, .videoBlock, .update_details' });

	return res;
}
|
||||
|
||||
/**
 * Fetches one listing page and scrapes it with the classic layout scraper.
 *
 * @param {object} site - Site entity; must carry `parameters` for the listing URL to be built.
 * @param {number} page - Listing page number.
 * @returns {Promise<object[]|number|null>} Scraped releases when the request succeeds, the response
 *   status when it does not, or null when the site has no parameters.
 */
async function fetchLatestClassic(site, page) {
	// without parameters there is no siteId to build the listing URL from
	if (!site.parameters) {
		return null;
	}

	const res = await fetchLatest(site, page);

	if (res.ok) {
		return scrapeLatestClassic(res.context, site);
	}

	return res.status;
}
|
||||
|
||||
/**
 * Fetches one listing page and scrapes it with the block layout scraper.
 *
 * @param {object} site - Site entity; must carry `parameters` for the listing URL to be built.
 * @param {number} page - Listing page number.
 * @returns {Promise<object[]|number|null>} Scraped releases when the request succeeds, the response
 *   status when it does not, or null when the site has no parameters.
 */
async function fetchLatestBlock(site, page) {
	if (site.parameters) {
		const res = await fetchLatest(site, page);

		return res.ok
			? scrapeLatestBlock(res.context, site)
			: res.status;
	}

	return null;
}
|
||||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
scrapeScene,
|
||||
fetchLatest: fetchLatestClassic,
|
||||
scrapeScene: scrapeSceneClassic,
|
||||
useUnprint: true,
|
||||
block: {
|
||||
scrapeScene: scrapeSceneBlock,
|
||||
fetchLatest: fetchLatestBlock,
|
||||
useUnprint: true,
|
||||
},
|
||||
};
|
||||
|
|
|
@ -36,6 +36,7 @@ async function curateReleaseEntry(release, batchId, existingRelease, type = 'sce
|
|||
url: release.url,
|
||||
date: Number(release.date) ? release.date : null,
|
||||
date_precision: release.datePrecision,
|
||||
duration: Number(release.duration) || null,
|
||||
slug,
|
||||
description: release.description,
|
||||
comment: release.comment,
|
||||
|
|
|
@ -36,21 +36,24 @@ const substitutes = {
|
|||
ỹ: 'y',
|
||||
};
|
||||
|
||||
function slugify(string, delimiter = '-', {
|
||||
function slugify(strings, delimiter = '-', {
|
||||
encode = false,
|
||||
removeAccents = true,
|
||||
removePunctuation = false,
|
||||
limit = 1000,
|
||||
} = {}) {
|
||||
if (!string || typeof string !== 'string') {
|
||||
return string;
|
||||
if (!strings || (typeof strings !== 'string' && !Array.isArray(strings))) {
|
||||
return strings;
|
||||
}
|
||||
|
||||
const slugComponents = string
|
||||
.trim()
|
||||
.toLowerCase()
|
||||
.replace(removePunctuation && /[.,:;'"_-]/g, '')
|
||||
.match(/[A-Za-zÀ-ÖØ-öø-ÿ0-9]+/g);
|
||||
const slugComponents = []
|
||||
.concat(strings)
|
||||
.filter(Boolean)
|
||||
.flatMap((string) => string
|
||||
.trim()
|
||||
.toLowerCase()
|
||||
.replace(removePunctuation && /[.,:;'"_-]/g, '')
|
||||
.match(/[A-Za-zÀ-ÖØ-öø-ÿ0-9]+/g));
|
||||
|
||||
if (!slugComponents) {
|
||||
return '';
|
||||
|
|