Pass a context object containing the site, network and scraper slug to all profile scrapers, instead of separate scraper slug and 'site or network' arguments.

This commit is contained in:
ThePendulum 2020-05-18 03:22:03 +02:00
parent 8733fdc657
commit 885aa4f627
22 changed files with 161 additions and 79 deletions

View File

@ -91,6 +91,10 @@ module.exports = {
'burningangel',
'brazzers',
'milehighmedia',
[
'devilsfilm',
'roccosiffredi',
],
[
'vixen',
'tushy',
@ -112,6 +116,7 @@ module.exports = {
],
'21sextury',
'julesjordan',
'peternorth',
'naughtyamerica',
'cherrypimps',
'pimpxxx',
@ -143,6 +148,10 @@ module.exports = {
'private',
'ddfnetwork',
'bangbros',
[
'silverstonedvd',
'silviasaint',
],
'kellymadison',
'gangbangcreampie',
'gloryholesecrets',

View File

@ -1899,6 +1899,7 @@ const sites = [
parameters: {
latest: '/en/All/scenes/0/latest/',
upcoming: '/en/All/scenes/0/upcoming',
classic: true,
},
},
{
@ -1910,6 +1911,7 @@ const sites = [
parameters: {
latest: '/en/scenes/All/0/',
upcoming: '/en/scenes/All/0/1/upcoming',
classic: true,
},
},
{

View File

@ -137,11 +137,10 @@ async function curateProfile(profile) {
name: profile.name,
avatar: profile.avatar,
scraper: profile.scraper,
site: profile.site,
network: profile.network,
};
curatedProfile.site = profile.site.isNetwork ? null : profile.site;
curatedProfile.network = profile.site.isNetwork ? profile.site : null;
curatedProfile.description = profile.description?.trim() || null;
curatedProfile.nationality = profile.nationality?.trim() || null; // used to derive country when country not available
curatedProfile.ethnicity = profile.ethnicity?.trim() || null;
@ -288,7 +287,7 @@ async function interpolateProfiles(actors) {
profile.tattoos = getLongest(valuesByProperty.tattoos);
profile.piercings = getLongest(valuesByProperty.piercings);
profile.avatar_media_id = avatars.sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0].id;
profile.avatar_media_id = avatars.sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0]?.id || null;
return profile;
});
@ -368,21 +367,25 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
try {
return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
const scraper = scrapers[scraperSlug];
const siteOrNetwork = networksBySlug[scraperSlug] || sitesBySlug[scraperSlug];
const context = {
site: sitesBySlug[scraperSlug] || null,
network: networksBySlug[scraperSlug] || null,
scraper: scraperSlug,
};
if (!scraper?.fetchProfile) {
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
}
if (!siteOrNetwork) {
if (!context.site && !context.network) {
logger.warn(`No site or network found for ${scraperSlug}`);
throw new Error(`No site or network found for ${scraperSlug}`);
}
logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);
const profile = await scraper.fetchProfile(actor.name, scraperSlug, siteOrNetwork, include);
const profile = await scraper.fetchProfile(actor.name, context, include);
if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
logger.verbose(`Profile for '${actor.name}' not available on ${scraperSlug}, scraper returned ${profile}`);
@ -392,8 +395,7 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
return {
...actor,
...profile,
scraper: scraperSlug,
site: siteOrNetwork,
...context,
};
}), Promise.reject(new Error()));
} catch (error) {
@ -424,7 +426,8 @@ async function scrapeActors(actorNames) {
]);
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: { ...network, isNetwork: true } }), {});
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {});
const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
@ -456,6 +459,8 @@ async function scrapeActors(actorNames) {
await upsertProfiles(profilesWithAvatarIds);
await interpolateProfiles(actors);
}
return profiles;
}
async function getOrCreateActors(baseActors, batchId) {

View File

@ -20,7 +20,13 @@ async function init() {
}
if (argv.actors) {
await scrapeActors(argv.actors);
const actors = await scrapeActors(argv.actors);
if (argv.actorScenes) {
const actorReleases = actors.map(actor => actor.releases).flat().filter(Boolean);
await storeReleases(actorReleases);
}
}
const updateBaseScenes = (argv.scrape || argv.sites || argv.networks) && await fetchUpdates();

View File

@ -32,13 +32,13 @@ const { argv } = yargs
.option('actor-scenes', {
describe: 'Fetch all scenes for an actor',
type: 'boolean',
alias: 'with-releases',
alias: 'with-scenes',
default: false,
})
.option('movie-scenes', {
describe: 'Fetch all scenes for a movie',
type: 'boolean',
alias: 'with-releases',
alias: 'with-scenes',
default: false,
})
.option('scene-movies', {

View File

@ -126,7 +126,7 @@ async function fetchScene(url, site) {
return res.status;
}
async function fetchProfile(actorName, scraperSlug, site) {
async function fetchProfile(actorName, { site }) {
const actorSlug = slugify(actorName, '');
const url = `${site.url}/tour/models/${actorSlug}.html`;
const res = await get(url, '.page-content .row');

View File

@ -132,7 +132,7 @@ async function fetchScene(url, site) {
return res.ok ? scrapeScene(res.item, url, site) : res.status;
}
async function fetchProfile(actorName, scraperSlug, site, include) {
async function fetchProfile(actorName, { site }, include) {
const actorSlugA = slugify(actorName, '');
const actorSlugB = slugify(actorName);

View File

@ -80,11 +80,69 @@ function scrapeScene(scene, site) {
return release;
}
function scrapeLatest(scenes, site) {
/**
 * Scrape a list of search hits into releases.
 *
 * @param {Array<Object>} scenes - hits whose `_source` property holds the raw scene
 * @param {Object} site - handed unchanged to scrapeScene for every hit
 * @returns {Array} one scrapeScene result per hit, in the original order
 */
function scrapeAll(scenes, site) {
  // unwrap the raw scene from its hit envelope before scraping it
  const toRelease = hit => scrapeScene(hit._source, site); // eslint-disable-line no-underscore-dangle

  return scenes.map(toRelease);
}
function scrapeProfile(actor) {
/**
 * Search the video index for an actor's scenes and scrape the hits into releases.
 *
 * The search asks for up to 50 entries with status 'ok' whose nested `actors.mongoId`
 * matches the actor's id, excludes entries of type 'trailer', and sorts by
 * `releaseDate` descending.
 *
 * @param {Object} actor - actor record; only `actor.id` is read here
 * @param {Object} site - handed unchanged to scrapeAll
 * @returns {Promise<Array>} whatever scrapeAll returns for the hits in the response
 */
async function fetchActorReleases(actor, site) {
  const searchUrl = `https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`;

  // nested clause selecting scenes that feature this actor
  const featuresActor = {
    nested: {
      path: 'actors',
      query: {
        bool: {
          must: [
            {
              match: {
                'actors.mongoId': {
                  operator: 'AND',
                  query: actor.id,
                },
              },
            },
          ],
        },
      },
    },
  };

  const payload = {
    size: 50,
    query: {
      bool: {
        must: [
          { match: { status: 'ok' } },
          featuresActor,
        ],
        must_not: [
          { match: { type: 'trailer' } },
        ],
      },
    },
    sort: [
      { releaseDate: { order: 'desc' } },
    ],
  };

  const requestOptions = {
    encodeJSON: true,
    headers: {
      Authorization: `Basic ${authKey}`,
    },
  };

  const res = await bhttp.post(searchUrl, payload, requestOptions);

  return scrapeAll(res.body.hits.hits, site);
}
async function scrapeProfile(actor, site, include) {
const profile = {};
profile.aliases = actor.aliases;
@ -115,7 +173,9 @@ function scrapeProfile(actor) {
if (actor.twitter) profile.social = [`https://www.twitter.com/${actor.twitter}`];
if (actor.image) profile.avatar = `https://i.bang.com/pornstars/${actor.identifier}.jpg`;
// TODO: get releases
if (include.releases) {
profile.releases = await fetchActorReleases(actor, site);
}
return profile;
}
@ -204,7 +264,7 @@ async function fetchLatest(site, page = 1) {
},
});
return scrapeLatest(res.body.hits.hits, site);
return scrapeAll(res.body.hits.hits, site);
}
async function fetchScene(url, site) {
@ -220,7 +280,7 @@ async function fetchScene(url, site) {
return scrapeScene(res.body._source, site); // eslint-disable-line no-underscore-dangle
}
async function fetchProfile(actorName) {
async function fetchProfile(actorName, actorSlug, site, include) {
const res = await post(`https://${clusterId}.us-east-1.aws.found.io/actors/actor/_search`, {
size: 5,
sort: [{
@ -255,7 +315,7 @@ async function fetchProfile(actorName) {
const actor = res.body.hits.hits.find(hit => hit._source.name.toLowerCase() === actorName.toLowerCase());
if (actor) {
return scrapeProfile(actor._source);
return scrapeProfile(actor._source, site, include);
}
return null;

View File

@ -25,8 +25,8 @@ function getActorReleasesUrl(actorPath, page = 1) {
return `https://www.blowpass.com/en/videos/blowpass/latest/All-Categories/0${actorPath}/${page}`;
}
async function networkFetchProfile(actorName, scraperSlug, site, include) {
return fetchProfile(actorName, scraperSlug, null, getActorReleasesUrl, include);
// Profile entry point for this scraper: forwards the actor name, scrape context and include
// options to fetchProfile, passing null as its third argument and this module's
// getActorReleasesUrl as the releases URL builder.
// NOTE(review): fetchProfile is presumably the shared Gamma implementation, where the third
// parameter is the alternative search URL flag — confirm against this file's imports.
async function networkFetchProfile(actorName, context, include) {
return fetchProfile(actorName, context, null, getActorReleasesUrl, include);
}
module.exports = {

View File

@ -200,7 +200,7 @@ async function fetchScene(url, site) {
return scrapeScene(res.body.toString(), url, site);
}
async function fetchProfile(actorName, scraperSlug, siteOrNetwork, include) {
async function fetchProfile(actorName, context, include) {
const searchUrl = 'https://brazzers.com/pornstars-search/';
const searchRes = await bhttp.get(searchUrl, {
headers: {

View File

@ -120,13 +120,15 @@ async function fetchScene(url, site, release) {
return res.ok ? scrapeScene(res.item, url, site, release) : res.status;
}
async function fetchProfile(actorName, scraperSlug) {
async function fetchProfile(actorName, { site, network, scraper }) {
const actorSlug = slugify(actorName);
const actorSlug2 = slugify(actorName, '');
const [url, url2] = ['cherrypimps', 'wildoncam'].includes(scraperSlug)
? [`https://${scraperSlug}.com/models/${actorSlug}.html`, `https://${scraperSlug}.com/models/${actorSlug2}.html`]
: [`https://${scraperSlug.replace('xxx', '')}.xxx/models/${actorSlug}.html`, `https://${scraperSlug.replace('xxx', '')}.xxx/models/${actorSlug2}.html`];
const origin = site?.url || network.url;
const [url, url2] = ['cherrypimps', 'wildoncam'].includes(scraper)
? [`${origin}/models/${actorSlug}.html`, `${origin}/models/${actorSlug2}.html`]
: [`${origin}/models/${actorSlug}.html`, `${origin}/models/${actorSlug2}.html`];
const res = await get(url);
if (res.ok) return scrapeProfile(res.item);

View File

@ -10,7 +10,7 @@ const {
fetchApiProfile,
scrapeAll,
} = require('./gamma');
const { get } = require('../utils/q');
const { get } = require('../utils/qu');
const slugify = require('../utils/slugify');
function extractLowArtActors(release) {
@ -57,10 +57,19 @@ function getActorReleasesUrl(actorPath, page = 1) {
return `https://www.peternorth.com/en/videos/All-Categories/0${actorPath}/All-Dvds/0/latest/${page}`;
}
async function fetchClassicProfile(actorName, siteSlug) {
/**
 * Build a profile from a classic actor page.
 *
 * @param {Object} page - query context for the fetched page
 * @param {Object} page.qu - query helper; `qu.img` is used to read the '.actorPicture' image
 * @param {*} page.html - page markup, handed to scrapeAll to extract the releases
 * @param {Object} site - site whose `url` is passed to scrapeAll
 * @returns {{ avatar: *, releases: * }} profile holding only an avatar and releases
 */
function scrapeClassicProfile({ qu, html }, site) {
  const avatar = qu.img('.actorPicture');
  const releases = scrapeAll(html, null, site.url, false);

  return { avatar, releases };
}
async function fetchClassicProfile(actorName, { site }) {
const actorSlug = slugify(actorName);
const url = `https://${siteSlug}.com/en/pornstars`;
const url = `${site.url}/en/pornstars`;
const pornstarsRes = await get(url);
if (!pornstarsRes.ok) return null;
@ -70,41 +79,23 @@ async function fetchClassicProfile(actorName, siteSlug) {
?.value;
if (actorPath) {
const actorUrl = `https://${siteSlug}.com${actorPath}`;
const actorUrl = `${site.url}${actorPath}`;
const res = await get(actorUrl);
if (res.ok) {
const releases = scrapeAll(res.item, null, `https://www.${siteSlug}.com`, false);
return { releases };
return scrapeClassicProfile(res.item, site);
}
}
return null;
}
async function networkFetchProfile(actorName, scraperSlug, site, include) {
// not all Fame Digital sites offer Gamma actors
const [devils, rocco, peter, silvia] = await Promise.all([
fetchApiProfile(actorName, 'devilsfilm', true),
fetchApiProfile(actorName, 'roccosiffredi'),
include.scenes ? fetchProfile(actorName, 'peternorth', true, getActorReleasesUrl, include) : [],
include.scenes ? fetchClassicProfile(actorName, 'silviasaint') : [],
include.scenes ? fetchClassicProfile(actorName, 'silverstonedvd') : [],
]);
async function networkFetchProfile(actorName, context, include) {
const profile = await ((context.site.parameters.api && fetchApiProfile(actorName, context, include))
|| (context.site.parameters.classic && include.scenes && fetchClassicProfile(actorName, context, include)) // classic profiles only have scenes, no bio
|| fetchProfile(actorName, context, true, getActorReleasesUrl, include));
if (devils || rocco || peter) {
const releases = [].concat(devils?.releases || [], rocco?.releases || [], peter?.releases || [], silvia?.releases || []);
return {
...peter,
...rocco,
...devils,
releases,
};
}
return null;
return profile;
}
module.exports = {

View File

@ -78,12 +78,10 @@ async function fetchScene(url, site) {
return res.ok ? scrapeScene(res.item, url, site) : res.status;
}
async function fetchProfile(actorName, scraperSlug) {
async function fetchProfile(actorName, { site }) {
const actorSlug = slugify(actorName, '');
const url = scraperSlug === 'povperverts'
? `https://povperverts.net/models/${actorSlug}.html`
: `https://${scraperSlug}.com/models/${actorSlug}.html`;
const url = `${site.url}/models/${actorSlug}.html`;
const res = await get(url);
return res.ok ? scrapeProfile(res.item, actorName) : res.status;

View File

@ -552,7 +552,9 @@ async function fetchActorScenes(actorName, apiUrl, siteSlug) {
return [];
}
async function fetchProfile(actorName, siteSlug, altSearchUrl, getActorReleasesUrl, include) {
async function fetchProfile(actorName, context, altSearchUrl, getActorReleasesUrl, include) {
const siteSlug = context.site?.slug || context.network.slug;
const actorSlug = actorName.toLowerCase().replace(/\s+/, '+');
const searchUrl = altSearchUrl
? `https://www.${siteSlug}.com/en/search/${actorSlug}/1/actor`
@ -579,7 +581,9 @@ async function fetchProfile(actorName, siteSlug, altSearchUrl, getActorReleasesU
return null;
}
async function fetchApiProfile(actorName, siteSlug, site, include) {
async function fetchApiProfile(actorName, context, include) {
const siteSlug = context.site?.slug || context.network.slug;
const actorSlug = encodeURI(actorName);
const referer = `https://www.${siteSlug}.com/en/search`;

View File

@ -381,7 +381,7 @@ async function fetchScene(url, site, baseRelease, beforeFetchLatest) {
return scrapeScene(res.item, site, url, baseRelease);
}
async function fetchProfile(actorName, scraperSlug, site) {
async function fetchProfile(actorName, { site }) {
const actorSlugA = slugify(actorName, '');
const actorSlugB = slugify(actorName);

View File

@ -235,7 +235,7 @@ async function fetchScene(url, site) {
}
/* API protected
async function fetchProfile(actorName, scraperSlug, site) {
async function fetchProfile(actorName, context , site) {
const session = bhttp.session();
await session.get(`https://tour.${site.slug}.com`);

View File

@ -222,8 +222,8 @@ async function fetchScene(url, site) {
return null;
}
async function fetchProfile(actorName, networkName, actorPath = 'model') {
const url = `https://www.${networkName}.com`;
async function fetchProfile(actorName, networkSlug, actorPath = 'model') {
const url = `https://www.${networkSlug}.com`;
const { session, instanceToken } = await getSession(url);
const res = await session.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, {
@ -236,7 +236,7 @@ async function fetchProfile(actorName, networkName, actorPath = 'model') {
const actorData = res.body.result.find(actor => actor.name.toLowerCase() === actorName.toLowerCase());
if (actorData) {
const actorUrl = `https://www.${networkName}.com/${actorPath}/${actorData.id}/`;
const actorUrl = `https://www.${networkSlug}.com/${actorPath}/${actorData.id}/`;
const actorReleasesUrl = `https://site-api.project1service.com/v2/releases?actorId=${actorData.id}&limit=100&offset=0&orderBy=-dateReleased&type=scene`;
const [actorRes, actorReleasesRes] = await Promise.all([
@ -249,11 +249,11 @@ async function fetchProfile(actorName, networkName, actorPath = 'model') {
]);
if (actorRes.statusCode === 200 && actorReleasesRes.statusCode === 200 && actorReleasesRes.body.result) {
return scrapeProfile(actorData, actorRes.body.toString(), actorReleasesRes.body.result, networkName);
return scrapeProfile(actorData, actorRes.body.toString(), actorReleasesRes.body.result, networkSlug);
}
if (actorRes.statusCode === 200) {
return scrapeProfile(actorData, actorRes.body.toString(), null, networkName);
return scrapeProfile(actorData, actorRes.body.toString(), null, networkSlug);
}
}
}

View File

@ -136,9 +136,9 @@ async function fetchScene(url, site) {
return res.ok ? scrapeScene(res.item, url, site) : res.status;
}
async function fetchProfile(actorName, siteSlug) {
async function fetchProfile(actorName, { site }) {
const firstLetter = actorName.charAt(0).toLowerCase();
const origin = slugUrlMap[siteSlug] || `https://www.${siteSlug}.com`;
const origin = slugUrlMap[site.slug] || site.url;
const url = `${origin}/model/alpha/${firstLetter}`;
const resModels = await get(url);

View File

@ -217,7 +217,7 @@ async function fetchScene(url, site) {
return null;
}
async function fetchProfile(actorName, scraperSlug, site, include, page = 1, source = 0) {
async function fetchProfile(actorName, context, include, page = 1, source = 0) {
const letter = actorName.charAt(0).toUpperCase();
const sources = [
@ -244,11 +244,11 @@ async function fetchProfile(actorName, scraperSlug, site, include, page = 1, sou
return null;
}
return fetchProfile(actorName, scraperSlug, site, include, page + 1, source);
return fetchProfile(actorName, context, include, page + 1, source);
}
if (sources[source + 1]) {
return fetchProfile(actorName, scraperSlug, site, include, 1, source + 1);
return fetchProfile(actorName, context, include, 1, source + 1);
}
return null;

View File

@ -154,12 +154,12 @@ module.exports = {
ddfnetwork,
deeper: vixen,
deeplush: nubiles,
devilsfilm: famedigital,
digitalplayground,
dtfsluts: fullpornnetwork,
evilangel,
eyeontheguy: hush,
fakehub,
famedigital,
freeones,
gangbangcreampie: aziani,
girlfaction: fullpornnetwork,
@ -189,15 +189,19 @@ module.exports = {
nympho: mikeadriano,
onlyprince: fullpornnetwork,
pervertgallery: fullpornnetwork,
peternorth: famedigital,
pimpxxx: cherrypimps,
pornhub,
povperverts: fullpornnetwork,
povpornstars: hush,
private: privateNetwork,
realitykings,
roccosiffredi: famedigital,
score,
seehimfuck: hush,
sexyhub: mindgeek,
silverstonedvd: famedigital,
silviasaint: famedigital,
swallowed: mikeadriano,
thatsitcomshow: nubiles,
transangels,
@ -207,6 +211,7 @@ module.exports = {
twistys,
vixen,
wicked,
wildoncam: cherrypimps,
xempire,
},
};

View File

@ -232,8 +232,8 @@ async function fetchScene(url, site, baseRelease) {
return res.code;
}
async function fetchProfile(actorName, scraperSlug, site, include) {
const origin = `https://www.${scraperSlug}.com`;
async function fetchProfile(actorName, { site }, include) {
const origin = site.url;
const actorSlug = slugify(actorName);
const url = `${origin}/api/${actorSlug}`;
const res = await get(url);

View File

@ -23,8 +23,8 @@ function getActorReleasesUrl(actorPath, page = 1) {
return `https://www.xempire.com/en/videos/xempire/latest/${page}/All-Categories/0${actorPath}`;
}
async function networkFetchProfile(actorName, scraperSlug, site, include) {
return fetchProfile(actorName, scraperSlug, null, getActorReleasesUrl, include);
// Profile entry point for this scraper: forwards the actor name, scrape context and include
// options to fetchProfile, passing null as its third argument and this module's
// getActorReleasesUrl as the releases URL builder.
// NOTE(review): fetchProfile is presumably the shared Gamma implementation, where the third
// parameter is the alternative search URL flag — confirm against this file's imports.
async function networkFetchProfile(actorName, context, include) {
return fetchProfile(actorName, context, null, getActorReleasesUrl, include);
}
module.exports = {