Switched to tabs. Adding missing actor entries when scraping actors, with batch ID.
This commit is contained in:
@@ -10,44 +10,44 @@ const slugify = require('../utils/slugify');
|
||||
const { ex } = require('../utils/q');
|
||||
|
||||
/**
 * Extracts one release object per `.echThumb` element found in a listing page.
 *
 * @param {string} html - Markup of the listing (or profile) page.
 * @param {object} [site] - Passed through untouched as `site` on every release.
 * @returns {object[]} Releases with url, entryId, shootId, title, actors, date,
 *   duration (seconds), poster, photos, rating (always null), site and channel.
 */
function scrape(html, site) {
	const $ = cheerio.load(html, { normalizeWhitespace: true });
	const sceneElements = $('.echThumb').toArray();

	return sceneElements.map((element) => {
		const sceneLinkElement = $(element).find('.thmb_lnk');
		const title = sceneLinkElement.attr('title');
		const url = `https://bangbros.com${sceneLinkElement.attr('href')}`;
		// the element ID is split on '-' and the second part is kept; left falsy when there is no ID
		const shootId = sceneLinkElement.attr('id') && sceneLinkElement.attr('id').split('-')[1];
		// first path segment of the URL minus its first five characters
		// NOTE(review): presumably strips a 'video' prefix - confirm against live URLs
		const entryId = url.split('/')[3].slice(5);

		const date = moment.utc($(element).find('.thmb_mr_2 span.faTxt').text(), 'MMM D, YYYY').toDate();
		const actors = $(element).find('.cast-wrapper a.cast').map((actorIndex, actorElement) => $(actorElement).text().trim()).toArray();

		const photoElement = $(element).find('.rollover-image');
		const poster = `https:${photoElement.attr('data-original')}`;

		// photo URLs are generated as <rollover-url>big1.jpg .. big<max-index>.jpg
		const photosUrl = photoElement.attr('data-rollover-url');
		const photosMaxIndex = photoElement.attr('data-rollover-max-index');
		const photos = Array.from({ length: photosMaxIndex }, (val, index) => `https:${photosUrl}big${index + 1}.jpg`);

		// NOTE(review): the '0:' prefix presumably makes moment read mm:ss as h:mm:ss - confirm
		const duration = moment.duration(`0:${$(element).find('.thmb_pic b.tTm').text()}`).asSeconds();
		// last path segment of the first link pointing at /websites
		const channel = $(element).find('a[href*="/websites"]').attr('href').split('/').slice(-1)[0];

		return {
			url,
			entryId,
			shootId,
			title,
			actors,
			date,
			duration,
			poster,
			photos,
			rating: null,
			site,
			channel,
		};
	});
}
|
||||
|
||||
/* no dates available, breaks database
|
||||
@@ -80,63 +80,63 @@ function scrapeUpcoming(html, site) {
|
||||
*/
|
||||
|
||||
/**
 * Builds a release object from a scene page.
 *
 * @param {string} html - Markup of the scene page.
 * @param {string} url - Scene URL; the trailing digits of its first path segment become `entryId`.
 * @param {object} [_site] - Unused.
 * @returns {object} Release with shootId, entryId, title, description, actors, tags,
 *   stars, poster (primary + fallback), trailer, photos and channel.
 * @throws {TypeError} When an expected element is missing and a match/replace is attempted on a non-string.
 */
function scrapeScene(html, url, _site) {
	// NOTE(review): `ex` comes from ../utils/q; presumably it scopes the query helpers to the given selector - confirm
	const { qu } = ex(html, '.playerSection');
	const release = {};

	[release.shootId] = qu.q('.vdoTags + .vdoCast', true).match(/\w+$/);
	[release.entryId] = url.split('/')[3].match(/\d+$/);
	release.title = qu.q('.ps-vdoHdd h1', true);
	release.description = qu.q('.vdoDesc', true);

	release.actors = qu.all('a[href*="/model"]', true);
	release.tags = qu.all('.vdoTags a', true);

	// leading number of the like element divided by 20
	// NOTE(review): presumably a percentage mapped onto a 0-5 scale - confirm
	release.stars = Number(qu.q('div[class*="like"]', true).match(/^\d+/)[0]) / 20;

	const poster = qu.img('img#player-overlay-image');
	release.poster = [
		poster,
		poster.replace('/big_trailer', '/members/450x340'), // load error fallback
	];

	release.trailer = { src: qu.trailer() };

	// all scenes seem to have 12 album photos available, not always included on the page
	const firstPhotoUrl = ex(html).qu.img('img[data-slider-index="1"]');
	release.photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`));

	const [channel] = qu.url('a[href*="/websites"]').match(/\w+$/);

	// a single if/else-if chain, so the final else cannot overwrite the 'bangcasting' mapping
	if (channel === 'bangcasting') release.channel = 'bangbroscasting';
	else if (channel === 'remaster') release.channel = 'bangbrosremastered';
	else release.channel = channel;

	return release;
}
|
||||
|
||||
/**
 * Builds an actor profile from a profile page.
 *
 * @param {string} html - Markup of the profile page.
 * @returns {object} Profile with `releases` (from `scrape(html)`, called without a site)
 *   and, when a `.profilePic img` source is found, an `avatar` URL prefixed with `https:`.
 */
function scrapeProfile(html) {
	const { q } = ex(html);
	const profile = {};

	const avatar = q('.profilePic img', 'src');
	if (avatar) profile.avatar = `https:${avatar}`;

	profile.releases = scrape(html);

	return profile;
}
|
||||
|
||||
/**
 * Finds the profile link for an actor in a search results page.
 *
 * @param {string} html - Markup of the search results page.
 * @param {string} actorName - Name matched case-insensitively against link titles.
 * @returns {string|null} Absolute profile URL on bangbros.com, or null when no link matches.
 */
function scrapeProfileSearch(html, actorName) {
	const { qu } = ex(html);

	// escape quotes and backslashes so the name cannot terminate the quoted attribute value
	const escapedName = String(actorName).replace(/["\\]/g, '\\$&');
	const actorLink = qu.url(`a[title="${escapedName}" i][href*="model"]`);

	return actorLink ? `https://bangbros.com${actorLink}` : null;
}
|
||||
|
||||
/**
 * Fetches one page of a site's listing and scrapes its releases.
 *
 * @param {object} site - Site entry; `site.url` is the listing base URL.
 * @param {number} [page=1] - Page number appended to the base URL as a path segment.
 * @returns {Promise<object[]>} Releases returned by `scrape` for the fetched page.
 */
async function fetchLatest(site, page = 1) {
	const res = await bhttp.get(`${site.url}/${page}`);

	return scrape(res.body.toString(), site);
}
|
||||
|
||||
/*
|
||||
@@ -148,43 +148,43 @@ async function fetchUpcoming(site) {
|
||||
*/
|
||||
|
||||
/**
 * Fetches a scene page from bangbros.com and scrapes it.
 *
 * @param {string} url - Absolute scene URL.
 * @param {object} site - Passed through to `scrapeScene`.
 * @param {object} [release] - Previously known release data; a warning is logged when it has no `date`.
 * @returns {Promise<object>} Release returned by `scrapeScene`.
 * @throws {TypeError} When `url` cannot be parsed by `URL`.
 * @throws {Error} When the URL's origin is not http(s)://bangbros.com or http(s)://www.bangbros.com.
 */
async function fetchScene(url, site, release) {
	if (!release?.date) {
		logger.warn(`Scraping Bang Bros scene from URL without release date: ${url}`);
	}

	const { origin } = new URL(url);

	// validate the host before any request is made; dots are escaped and the pattern is
	// anchored so lookalike hosts such as wwwxbangbros.com are rejected
	if (!/^https?:\/\/(www\.)?bangbros\.com$/.test(origin)) {
		throw new Error('Cannot fetch from this URL. Please find the scene on https://bangbros.com and try again.');
	}

	const res = await bhttp.get(url);

	return scrapeScene(res.body.toString(), url, site);
}
|
||||
|
||||
/**
 * Looks an actor up through the site search and scrapes their profile page.
 *
 * @param {string} actorName - Actor name; slugified for the search URL and matched against result titles.
 * @returns {Promise<object|null>} Profile from `scrapeProfile`, or null when the search
 *   request fails, no matching profile link is found, or the profile request fails.
 */
async function fetchProfile(actorName) {
	const actorSlug = slugify(actorName);
	const url = `https://bangbros.com/search/${actorSlug}`;
	const res = await bhttp.get(url);

	if (res.statusCode !== 200) {
		return null;
	}

	const actorUrl = scrapeProfileSearch(res.body.toString(), actorName);

	if (!actorUrl) {
		return null;
	}

	const actorRes = await bhttp.get(actorUrl);

	if (actorRes.statusCode !== 200) {
		return null;
	}

	return scrapeProfile(actorRes.body.toString());
}
|
||||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchScene,
|
||||
fetchProfile,
|
||||
// fetchUpcoming, no dates available
|
||||
fetchLatest,
|
||||
fetchScene,
|
||||
fetchProfile,
|
||||
// fetchUpcoming, no dates available
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user