Extracting shoot IDs from title in PornBox scraper.
This commit is contained in:
parent
f76341f0dd
commit
762e605bd1
|
|
@ -94,7 +94,7 @@
|
||||||
"tunnel": "0.0.6",
|
"tunnel": "0.0.6",
|
||||||
"ua-parser-js": "^1.0.37",
|
"ua-parser-js": "^1.0.37",
|
||||||
"undici": "^5.28.1",
|
"undici": "^5.28.1",
|
||||||
"unprint": "^0.18.11",
|
"unprint": "^0.18.13",
|
||||||
"url-pattern": "^1.0.3",
|
"url-pattern": "^1.0.3",
|
||||||
"v-tooltip": "^2.1.3",
|
"v-tooltip": "^2.1.3",
|
||||||
"video.js": "^8.6.1",
|
"video.js": "^8.6.1",
|
||||||
|
|
@ -20380,9 +20380,9 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/unprint": {
|
"node_modules/unprint": {
|
||||||
"version": "0.18.11",
|
"version": "0.18.13",
|
||||||
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.18.11.tgz",
|
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.18.13.tgz",
|
||||||
"integrity": "sha512-mHOfweWWLqhEIRnjhdqCzEpHhIx+m/GwE2eDvJNNbnVEPbV8q8EaN6eGH3vkcAwDVgNIOakZaTZFK+VKy13Lsg==",
|
"integrity": "sha512-vjUF7X7/dg2Os/zesJ0+23eVc7NH2oKzspPSyBzcIx6IuEcVm1rdlD9dAxdaRMUNBWEeA5ekyk263CBI3lyaBQ==",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"bottleneck": "^2.19.5",
|
"bottleneck": "^2.19.5",
|
||||||
"cookie": "^1.1.1",
|
"cookie": "^1.1.1",
|
||||||
|
|
|
||||||
|
|
@ -153,7 +153,7 @@
|
||||||
"tunnel": "0.0.6",
|
"tunnel": "0.0.6",
|
||||||
"ua-parser-js": "^1.0.37",
|
"ua-parser-js": "^1.0.37",
|
||||||
"undici": "^5.28.1",
|
"undici": "^5.28.1",
|
||||||
"unprint": "^0.18.11",
|
"unprint": "^0.18.13",
|
||||||
"url-pattern": "^1.0.3",
|
"url-pattern": "^1.0.3",
|
||||||
"v-tooltip": "^2.1.3",
|
"v-tooltip": "^2.1.3",
|
||||||
"video.js": "^8.6.1",
|
"video.js": "^8.6.1",
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
|
|
||||||
const unprint = require('unprint');
|
const unprint = require('unprint');
|
||||||
|
|
||||||
const http = require('../utils/http');
|
|
||||||
const slugify = require('../utils/slugify');
|
const slugify = require('../utils/slugify');
|
||||||
|
|
||||||
function extractTitle(originalTitle) {
|
function extractTitle(originalTitle) {
|
||||||
|
|
@ -43,6 +42,25 @@ function scrapeAll(scenes, channel) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Fetches one page of a channel's latest scenes and scrapes the scene cards found on it.
 *
 * @param {object} channel - Channel entity; its `url` is used as the base of the listing URL.
 * @param {number} page - Page number appended to the `/latest/` path.
 * @returns {Promise<*>} The result of `scrapeAll` when the response is ok, otherwise the response status.
 */
async function fetchLatest(channel, page) {
	// Alternative kept for reference, analvids as channel:
	// const res = await unprint.get(`https://www.analvids.com/new-videos/${page}`, { selectAll: '.card-scene' });

	// studios as channels
	const latestUrl = `${channel.url}/latest/${page}`;

	const requestOptions = {
		selectAll: '.card-scene',
		headers: {
			// the listing URL itself is sent as the referer
			Referer: latestUrl,
		},
	};

	const res = await unprint.get(latestUrl, requestOptions);

	return res.ok
		? scrapeAll(res.context, channel)
		: res.status;
}
|
||||||
|
|
||||||
function scrapeScene({ query }, url) {
|
function scrapeScene({ query }, url) {
|
||||||
const release = {};
|
const release = {};
|
||||||
|
|
||||||
|
|
@ -76,71 +94,6 @@ function scrapeScene({ query }, url) {
|
||||||
return release;
|
return release;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Builds an actor profile from a profile page.
 *
 * @param {object} context - Page context; only its `query` helper is used.
 * @param {string} url - Profile URL, stored on the returned profile as-is.
 * @param {object} channel - Channel entity, passed through to `scrapeAll` for the listed scenes.
 * @returns {object} Profile with `url`, `nationality`, `age`, `avatar` and `scenes`.
 */
function scrapeProfile({ query }, url, channel) {
	// Properties are evaluated top to bottom, so the queries run in the same order as the keys appear.
	return {
		url,
		nationality: query.content('.model__info a[href*="/nationality"]'),
		// XPath: the cell that follows the "Age" label cell
		age: query.number('//td[contains(text(), "Age")]/following-sibling::td'),
		avatar: query.img('.model__left img'),
		// scene cards on the profile page go through the same scraper as the listing pages
		scenes: scrapeAll(unprint.initAll(query.all('.card-scene')), channel),
	};
}
|
|
||||||
|
|
||||||
/**
 * Fetches one page of a channel's latest scenes and scrapes the scene cards found on it.
 *
 * @param {object} channel - Channel entity; its `url` is used as the base of the listing URL.
 * @param {number} page - Page number appended to the `/latest/` path.
 * @returns {Promise<*>} The result of `scrapeAll` when the response is ok, otherwise the response status.
 */
async function fetchLatest(channel, page) {
	// Alternative kept for reference, analvids as channel:
	// const res = await unprint.get(`https://www.analvids.com/new-videos/${page}`, { selectAll: '.card-scene' });

	// studios as channels
	const latestUrl = `${channel.url}/latest/${page}`;

	const requestOptions = {
		selectAll: '.card-scene',
		headers: {
			// the listing URL itself is sent as the referer
			Referer: latestUrl,
		},
	};

	const res = await unprint.get(latestUrl, requestOptions);

	return res.ok
		? scrapeAll(res.context, channel)
		: res.status;
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
async function fetchLatest(channel, page) {
|
|
||||||
// const res = await unprint.get(`https://www.analvids.com/new-videos/${page}`, { selectAll: '.card-scene' }); // analvids as channel
|
|
||||||
// const res = await unprint.get(`${channel.url}/latest/${page}`, { selectAll: '.card-scene' }); // studios as channels
|
|
||||||
const url = `${channel.url}/latest/${page}`; // studios as channels
|
|
||||||
|
|
||||||
const { tab } = await http.getBrowserSession('analvids', {
|
|
||||||
bypass: {
|
|
||||||
headless: false,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
const res = await tab.goto(url);
|
|
||||||
|
|
||||||
const status = res.status();
|
|
||||||
|
|
||||||
console.log('STATUS', status);
|
|
||||||
|
|
||||||
if (status === 200) {
|
|
||||||
const html = await tab.content();
|
|
||||||
const context = unprint.initAll(html, '.card-scene'); // studios as channels
|
|
||||||
|
|
||||||
const scenes = scrapeAll(context, channel);
|
|
||||||
|
|
||||||
tab.close();
|
|
||||||
|
|
||||||
return scenes;
|
|
||||||
}
|
|
||||||
|
|
||||||
return res.status;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
async function fetchScene(url) {
|
async function fetchScene(url) {
|
||||||
const res = await unprint.get(url, {
|
const res = await unprint.get(url, {
|
||||||
headers: {
|
headers: {
|
||||||
|
|
@ -155,6 +108,19 @@ async function fetchScene(url) {
|
||||||
return res.status;
|
return res.status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds an actor profile from a profile page.
 *
 * @param {object} context - Page context; only its `query` helper is used.
 * @param {string} url - Profile URL, stored on the returned profile as-is.
 * @param {object} channel - Channel entity, passed through to `scrapeAll` for the listed scenes.
 * @returns {object} Profile with `url`, `nationality`, `age`, `avatar` and `scenes`.
 */
function scrapeProfile({ query }, url, channel) {
	// Properties are evaluated top to bottom, so the queries run in the same order as the keys appear.
	return {
		url,
		nationality: query.content('.model__info a[href*="/nationality"]'),
		// XPath: the cell that follows the "Age" label cell
		age: query.number('//td[contains(text(), "Age")]/following-sibling::td'),
		avatar: query.img('.model__left img'),
		// scene cards on the profile page go through the same scraper as the listing pages
		scenes: scrapeAll(unprint.initAll(query.all('.card-scene')), channel),
	};
}
|
||||||
|
|
||||||
async function getActorUrl(actor, channel) {
|
async function getActorUrl(actor, channel) {
|
||||||
if (actor.url) {
|
if (actor.url) {
|
||||||
return actor.url;
|
return actor.url;
|
||||||
|
|
@ -162,7 +128,7 @@ async function getActorUrl(actor, channel) {
|
||||||
|
|
||||||
const searchUrl = `${channel.url}/api/autocomplete/search?q=${slugify(actor.name, '+')}`;
|
const searchUrl = `${channel.url}/api/autocomplete/search?q=${slugify(actor.name, '+')}`;
|
||||||
|
|
||||||
const searchRes = await http.get(searchUrl, {
|
const searchRes = await unprint.get(searchUrl, {
|
||||||
headers: {
|
headers: {
|
||||||
Referer: actor.url,
|
Referer: actor.url,
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -25,14 +25,24 @@ async function getTrailer(data) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extracts a shoot ID from a scene title.
 *
 * A shoot ID is the first occurrence of 2-3 uppercase letters followed by
 * 3-4 digits and an optional trailing word character (e.g. `GIO1234`, `SZ2905b`).
 *
 * @param {string} [title] - Scene title to search.
 * @returns {string|null} The upper-cased shoot ID, or `null` when the title is
 *   missing, is not a string, or contains no shoot ID.
 */
function extractShootId(title) {
	// Non-string titles (undefined, null, numbers, ...) cannot contain a shoot ID;
	// bail out instead of throwing on `.trim()`.
	if (typeof title !== 'string') {
		return null;
	}

	const match = title.trim().match(/[A-Z]{2,3}\d{3,4}\w?/);

	// `match` is null when nothing matched; always report that as `null`
	// rather than leaking `undefined` from an optional chain.
	if (!match) {
		return null;
	}

	// The optional trailing `\w` may be lower case, so normalise the whole ID.
	return match[0].toUpperCase();
}
|
||||||
|
|
||||||
async function scrapeScene(data, channel, include) {
|
async function scrapeScene(data, channel, include) {
|
||||||
const release = {};
|
const release = {};
|
||||||
const entityUrl = new URL(channel.url).origin;
|
const entityUrl = new URL(channel.url).origin;
|
||||||
|
|
||||||
|
release.entryId = data.id;
|
||||||
|
|
||||||
release.title = data.scene_name || data.custom_name;
|
release.title = data.scene_name || data.custom_name;
|
||||||
|
|
||||||
release.entryId = data.id;
|
release.url = `${entityUrl}/watch/${data.id}/${slugify(release.title, '_') || ''}`;
|
||||||
release.url = `${entityUrl}/watch/${data.id}/${slugify(release.title, '_')}`;
|
release.shootId = extractShootId(release.title);
|
||||||
|
|
||||||
release.date = new Date(data.release_date || data.publish_date);
|
release.date = new Date(data.release_date || data.publish_date);
|
||||||
release.duration = unprint.extractDuration(data.runtime);
|
release.duration = unprint.extractDuration(data.runtime);
|
||||||
|
|
|
||||||
|
|
@ -56,7 +56,7 @@ function slugify(strings, delimiter = '-', {
|
||||||
symbolRegex = defaultSymbolRegex,
|
symbolRegex = defaultSymbolRegex,
|
||||||
} = {}) {
|
} = {}) {
|
||||||
if (!strings || (typeof strings !== 'string' && !Array.isArray(strings))) {
|
if (!strings || (typeof strings !== 'string' && !Array.isArray(strings))) {
|
||||||
return strings;
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
const string = [].concat(strings).join(' ');
|
const string = [].concat(strings).join(' ');
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue