Enabled pagination on network page.

This commit is contained in:
2020-05-26 04:11:29 +02:00
parent fe69ec4175
commit 86377fec5f
43 changed files with 164 additions and 81 deletions

View File

@@ -16,7 +16,7 @@ function scrape(html, site) {
return sceneElements.map((element) => {
const sceneLinkElement = $(element).find('.thmb_lnk');
const title = sceneLinkElement.attr('title');
const url = site.legacy
const url = site.parameters?.legacy
? `https://${site.url}{sceneLinkElement.attr('href')}`
: `https://bangbros.com${sceneLinkElement.attr('href')}`;
const shootId = sceneLinkElement.attr('id') && sceneLinkElement.attr('id').split('-')[1];
@@ -150,14 +150,14 @@ function scrapeSceneLegacy({ qu }, url) {
return release;
}
function scrapeProfile(html) {
function scrapeProfile(html, scope) {
const { q } = ex(html);
const profile = {};
const avatar = q('.profilePic img', 'src');
if (avatar) profile.avatar = `https:${avatar}`;
profile.releases = scrape(html);
profile.releases = scrape(html, scope.network);
return profile;
}
@@ -221,7 +221,7 @@ async function fetchScene(url, site, release) {
return scrapeScene(res.item.html, url, site);
}
async function fetchProfile(actorName) {
async function fetchProfile(actorName, scope) {
const actorSlug = slugify(actorName);
const url = `https://bangbros.com/search/${actorSlug}`;
const res = await bhttp.get(url);
@@ -233,7 +233,7 @@ async function fetchProfile(actorName) {
const actorRes = await bhttp.get(actorUrl);
if (actorRes.statusCode === 200) {
return scrapeProfile(actorRes.body.toString());
return scrapeProfile(actorRes.body.toString(), scope);
}
}
}

View File

@@ -100,7 +100,7 @@ async function scrapeScene(html, url, _site) {
const siteElement = $('.niche-site-logo');
// const siteUrl = `https://www.brazzers.com${siteElement.attr('href').slice(0, -1)}`;
const siteName = siteElement.attr('title');
release.channel = siteName.replace(/\s+/g, '').toLowerCase();
release.channel = slugify(siteName, '');
release.tags = $('.tag-card-container a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
release.photos = $('.carousel-thumb a').map((photoIndex, photoElement) => `https:${$(photoElement).attr('href')}`).toArray();

View File

@@ -59,16 +59,17 @@ function scrapeScene(html, url, site) {
const originalUrl = `${protocol}//${hostname}${pathname}`;
const entryId = originalUrl.split('-').slice(-1)[0];
const title = sceneElement.find('h1.scene-title.grey-text').text();
const title = sceneElement.find('h1.scene-title').text();
const description = sceneElement.find('.synopsis').contents().slice(2).text().replace(/[\s\n]+/g, ' ').trim();
const date = moment.utc(sceneElement.find('span.entry-date').text(), 'MMM D, YYYY').toDate();
const actors = $('a.scene-title.grey-text.link').map((actorIndex, actorElement) => $(actorElement).text()).toArray();
const date = moment.utc(sceneElement.find('span.entry-date').text()?.match(/\w+ \d{1,2}, \d{4}/), 'MMM D, YYYY').toDate();
const actors = $('.performer-list a, h1 a.scene-title').map((actorIndex, actorElement) => $(actorElement).text()).toArray();
const duration = Number(sceneElement.find('.duration-ratings .duration').text().slice(10, -4)) * 60;
const poster = `https:${$('video, dl8-video').attr('poster')}`;
const photos = $('.contain-scene-images.desktop-only a').map((index, el) => `https:${$(el).attr('href')}`).toArray();
const posterPath = $('video, dl8-video').attr('poster') || $('img.start-card').attr('src');
const poster = posterPath && `https:${posterPath}`;
const photos = $('.contain-scene-images.desktop-only a').map((index, el) => $(el).attr('href')).toArray().filter(Boolean).map(photo => `https:${photo}`);
const trailerEl = $('source');
const trailerSrc = trailerEl.attr('src');
@@ -90,10 +91,10 @@ function scrapeScene(html, url, site) {
tags,
photos,
poster,
trailer: {
trailer: trailerSrc ? {
src: trailerSrc,
type: trailerType,
},
} : null,
rating: null,
site,
channel,

View File

@@ -25,7 +25,7 @@ function curateReleaseEntry(release, batchId, existingRelease) {
shoot_id: release.shootId || null,
studio_id: release.studio?.id || null,
url: release.url,
date: release.date,
date: Number(release.date) ? release.date : null,
slug,
description: release.description,
duration: release.duration,
@@ -47,7 +47,7 @@ function curateReleaseEntry(release, batchId, existingRelease) {
}
async function attachChannelSites(releases) {
const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isNetwork));
const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isNetwork || release.site.slug !== release.channel));
const channelSites = await knex('sites')
.leftJoin('networks', 'networks.id', 'sites.network_id')
@@ -58,10 +58,6 @@ async function attachChannelSites(releases) {
const releasesWithChannelSite = await Promise.all(releases
.map(async (release) => {
if (release.site && !release.site.isNetwork) {
return release;
}
if (release.channel && channelSitesBySlug[release.channel]) {
const curatedSite = await curateSite(channelSitesBySlug[release.channel]);
@@ -71,6 +67,11 @@ async function attachChannelSites(releases) {
};
}
if (release.site && !release.site.isNetwork) {
return release;
}
if (release.site && release.site.isNetwork) {
return {
...release,
@@ -129,6 +130,10 @@ function attachReleaseIds(releases, storedReleases) {
function filterInternalDuplicateReleases(releases) {
const releasesBySiteIdAndEntryId = releases.reduce((acc, release) => {
if (!release.site) {
return acc;
}
if (!acc[release.site.id]) {
acc[release.site.id] = {};
}
@@ -221,7 +226,7 @@ async function storeReleases(releases) {
const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
const storedReleases = await knex('releases').insert(curatedNewReleaseEntries).returning('*');
const storedReleases = await knex.batchInsert('releases', curatedNewReleaseEntries).returning('*');
// TODO: update duplicate releases
const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];

View File

@@ -25,7 +25,7 @@ async function actorPosters(actorNames) {
const source = path.join(config.media.path, poster.path);
const directory = path.join(config.media.path, 'extracted', poster.actor_name);
const target = path.join(directory, `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_')} (${moment.utc(poster.date).format('YYYY-MM-DD')})-${poster.index}.jpeg`);
const target = path.join(directory, `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_') || poster.actor_name} (${moment.utc(poster.date).format('YYYY-MM-DD')})-${poster.index}.jpeg`);
await fs.mkdir(path.join(directory), { recursive: true });
const file = await fs.readFile(source);

View File

@@ -1,14 +1,45 @@
'use strict';
/**
 * ASCII replacements for accented lowercase Latin characters.
 *
 * Keys are single characters, values are the plain-ASCII string to put in
 * their place (one or two letters, e.g. 'æ' -> 'ae', 'ß' -> 'ss').
 * The accent-removal lookup in this file falls back to an empty string for
 * any matched character that has no entry here, so a missing key means the
 * character is dropped rather than transliterated.
 */
const substitutes = {
	à: 'a',
	á: 'a',
	ä: 'a',
	å: 'a',
	æ: 'ae',
	ç: 'c',
	è: 'e',
	é: 'e',
	ë: 'e',
	ì: 'i',
	í: 'i',
	ï: 'i',
	ǹ: 'n',
	ń: 'n',
	ñ: 'n',
	ò: 'o',
	ó: 'o',
	ö: 'o',
	ø: 'o',
	œ: 'oe',
	ß: 'ss',
	ù: 'u',
	ú: 'u',
	ü: 'u',
	// The key on this line had been lost, leaving a bare `: 'y'` that does not
	// parse. Restored as y-with-grave (U+1EF3), matching the grave / acute /
	// diaeresis ordering used for every other vowel group in this table.
	ỳ: 'y',
	ý: 'y',
	ÿ: 'y',
};
function slugify(string, delimiter = '-', {
encode = false,
removeAccents = true,
limit = 1000,
} = {}) {
if (!string || typeof string !== 'string') {
return string;
}
const slugComponents = string.trim().toLowerCase().match(/\w+/g);
const slugComponents = string.trim().toLowerCase().match(/[A-Za-zÀ-ÖØ-öø-ÿ]+/g);
if (!slugComponents) {
return '';
@@ -18,6 +49,12 @@ function slugify(string, delimiter = '-', {
const accSlug = `${acc}${index > 0 ? delimiter : ''}${component}`;
if (accSlug.length < limit) {
if (removeAccents) {
return accSlug.replace(/[à-ÿ]/g, (match) => {
return substitutes[match] || '';
});
}
return accSlug;
}