Updated Full Porn Network scraper.

This commit is contained in:
2020-05-29 22:43:03 +02:00
parent 9903423caf
commit 8abcc7194a
104 changed files with 75 additions and 47 deletions

View File

@@ -77,7 +77,7 @@ function itemsByKey(items, key) {
}
function toBaseSource(rawSource) {
if (rawSource.src || (rawSource.extract && rawSource.url)) {
if (rawSource && (rawSource.src || (rawSource.extract && rawSource.url))) {
const baseSource = {};
if (rawSource.src) baseSource.src = rawSource.src;

View File

@@ -3,85 +3,103 @@
const { get, geta, ctxa } = require('../utils/q');
const slugify = require('../utils/slugify');
/**
 * Scrape the scene tiles of an overview page into release objects.
 *
 * @param {Array<{ qu: object }>} scenes - One query context per scene tile; only `qu` is read.
 * @param {{ url: string }} [site] - Site whose `url` is prefixed to each scene link.
 *   Optional: without it the link is kept exactly as found on the page.
 * @returns {object[]} One release per tile with `url`, `entryId`, `title`, and, when the
 *   corresponding markup is present, `duration` (seconds), `actors` and `poster`.
 */
function scrapeAll(scenes, site) {
  // Not every caller has a site at hand; fall back to an empty prefix instead of
  // throwing on `site.url`.
  const baseUrl = (site && site.url) || '';

  return scenes.map(({ qu }) => {
    const release = {};

    const href = qu.url('.scene-title a');

    if (href) {
      release.url = `${baseUrl}${href}`;

      // The last path segment doubles as the entry ID. The placeholder base only
      // lets URL parse a relative link when no site was supplied; it is ignored
      // for absolute URLs and never ends up in the result.
      release.entryId = new URL(release.url, 'http://localhost').pathname
        .toLowerCase()
        .replace(/\/$/, '')
        .split('/')
        .slice(-1)[0];
    }

    release.title = qu.q('.scene-title', true);

    // String.prototype.match returns null when the pattern is absent, so check
    // the result before indexing into it.
    const details = qu.q('.scene-details', true);
    const minutes = details && details.match(/(\d+) minutes/);

    if (minutes) release.duration = Number(minutes[1]) * 60;

    // Actors are read as one comma-separated text; drop empty entries.
    const actors = qu.text('.update-models');

    if (actors) release.actors = actors.trim().split(/\s*,\s*/).filter(Boolean);

    const poster = qu.img('.scene-thumb img');

    if (poster) {
      // Try the -2x variant first, then the image as found.
      release.poster = [
        poster.replace('-1x', '-2x'),
        poster,
      ];
    }

    return release;
  });
}
/**
 * Scrape a single scene page into a release object.
 *
 * @param {{ qu: object }} context - Query context for the scene page; only `qu` is read.
 * @param {string} url - Absolute URL of the scene page; its last path segment becomes the entry ID.
 * @param {{ url: string }} site - Site whose `url` is prefixed to the trailer source.
 * @returns {object} Release with `url`, `entryId`, `title`, `description`, `actors`, `tags`
 *   and, when a video source is found, `trailer`.
 */
function scrapeScene({ qu }, url, site) {
  // Lower-cased last path segment, ignoring a trailing slash.
  const entryId = new URL(url).pathname
    .toLowerCase()
    .replace(/\/$/, '')
    .split('/')
    .slice(-1)[0];

  const release = {
    url,
    entryId,
    title: qu.q('h4.text-center', true),
    description: qu.q('p.hide-for-small-only', true),
    actors: qu.all('a[href*="/model"]', true),
    tags: qu.all('a[href*="/category"]', true),
  };

  const trailer = qu.video('source');

  if (trailer) release.trailer = { src: `${site.url}${trailer}` };

  return release;
}
/**
 * Scrape an actor page into a profile object.
 *
 * @param {{ el: object, qu: object }} context - Page element and query helpers for the actor page.
 * @param {string} actorName - Name the profile was requested for.
 * @param {{ url: string }} [site] - Optional site, forwarded to scrapeAll for the releases
 *   listed on the page. Callers that have the site should pass it.
 * @returns {object|null} Profile with `releases` and, when present, `description` and `avatar`;
 *   null when the page heading does not match the requested actor.
 */
function scrapeProfile({ el, qu }, actorName, site) {
  // The site does not answer with a 404 for unknown actors, so compare the
  // page heading against the requested name instead.
  const isRequestedActor = slugify(qu.q('h1', true)) === slugify(actorName);

  if (!isRequestedActor) {
    return null;
  }

  const profile = {};

  const description = qu.q('h4 + p', true);

  if (description) profile.description = description;

  const avatar = qu.img('main img');

  if (avatar) {
    // Try the set-2x variant first, then the image as found.
    profile.avatar = [
      avatar.replace('set-1x', 'set-2x'),
      avatar,
    ];
  }

  profile.releases = scrapeAll(ctxa(el, '.update, .scene-update'), site);

  return profile;
}
/**
 * Fetch one page of the site's recent scenes and scrape its tiles.
 *
 * @param {{ url: string }} site - Site whose `url` is the base of the overview URL.
 * @param {number} [page=1] - Page number placed in the overview URL.
 * @returns {Promise<object[]|*>} Scraped releases when the request succeeded,
 *   otherwise the response's `status`.
 */
async function fetchLatest(site, page = 1) {
  const url = `${site.url}/1/scenes/recent/${page}/`;
  const res = await geta(url, '.latest-updates .update, .scene-update');

  if (!res.ok) {
    return res.status;
  }

  return scrapeAll(res.items, site);
}
/**
 * Fetch a scene page and scrape it.
 *
 * @param {string} url - URL of the scene page.
 * @param {{ url: string }} site - Site the scene belongs to; forwarded to scrapeScene.
 * @returns {Promise<object|*>} The scraped release, or the response's `status` when the
 *   request failed or the `main` element was not found.
 */
async function fetchScene(url, site) {
  const res = await get(url, 'main');

  // A successful response can still lack the element to scrape.
  if (res.ok && res.item) {
    return scrapeScene(res.item, url, site);
  }

  return res.status;
}
async function fetchProfile(actorName, { site }) {
const actorSlug = slugify(actorName, '');
const url = `${site.url}/models/${actorSlug}.html`;
const url = `${site.url}/1/model/${actorSlug}`;
const res = await get(url);
return res.ok ? scrapeProfile(res.item, actorName) : res.status;

View File

@@ -56,9 +56,7 @@ function slugify(string, delimiter = '-', {
if (accSlug.length < limit) {
if (removeAccents) {
return accSlug.replace(/[à-ÿ]/g, (match) => {
return substitutes[match] || '';
});
return accSlug.replace(/[à-ÿ]/g, match => substitutes[match] || '');
}
return accSlug;

View File

@@ -26,7 +26,7 @@ const {
fetchActors,
} = require('./actors');
function initServer() {
async function initServer() {
const app = express();
const router = Router();