Passing matching site to profile scrapers. Allowing scrapers to pass avatar metadata. Added scraper and copyright properties to media. Auto-adding copyright from site or scraper to avatars. Separated Porn Pros from Whale Member.

This commit is contained in:
ThePendulum 2020-02-24 03:12:58 +01:00
parent 73443b77a8
commit 6d1f30f703
40 changed files with 232 additions and 123 deletions

View File

@ -43,6 +43,7 @@
>
<img
:src="`/media/${actor.avatar.thumbnail}`"
:title="actor.avatar.copyright && `© ${actor.avatar.copyright}`"
class="avatar"
>
</a>

View File

@ -12,6 +12,7 @@
>
<img
:src="`/media/${actor.avatar.thumbnail}`"
:title="actor.avatar.copyright && `© ${actor.avatar.copyright}`"
class="avatar photo"
>
</a>
@ -26,6 +27,7 @@
>
<img
:src="`/media/${photo.thumbnail}`"
:title="photo.copyright && `© ${photo.copyright}`"
class="photo"
>
</a>

View File

@ -38,11 +38,13 @@
:class="{ expanded }"
/>
<div class="networks">
<Network
v-for="childNetwork in networks"
:key="`network-${childNetwork.id}`"
:network="childNetwork"
/>
</div>
<Network
v-if="network.parent"

View File

@ -80,6 +80,7 @@ function initActorActions(store, _router) {
media {
thumbnail
path
copyright
}
}
photos: actorsPhotos {
@ -88,6 +89,7 @@ function initActorActions(store, _router) {
thumbnail
path
index
copyright
}
}
birthCity

View File

@ -31,6 +31,8 @@ exports.up = knex => Promise.resolve()
table.float('entropy');
table.text('comment');
table.string('scraper', 32);
table.string('copyright', 100);
table.string('source', 1000);
table.unique('hash');

Binary file not shown.

Before

Width:  |  Height:  |  Size: 11 KiB

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.2 KiB

View File

Before

Width:  |  Height:  |  Size: 4.3 KiB

After

Width:  |  Height:  |  Size: 4.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

View File

Before

Width:  |  Height:  |  Size: 33 KiB

After

Width:  |  Height:  |  Size: 33 KiB

View File

Before

Width:  |  Height:  |  Size: 35 KiB

After

Width:  |  Height:  |  Size: 35 KiB

View File

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 24 KiB

View File

Before

Width:  |  Height:  |  Size: 30 KiB

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 862 B

View File

Before

Width:  |  Height:  |  Size: 28 KiB

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.2 KiB

View File

Before

Width:  |  Height:  |  Size: 5.3 KiB

After

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

View File

Before

Width:  |  Height:  |  Size: 32 KiB

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 KiB

View File

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

View File

Before

Width:  |  Height:  |  Size: 26 KiB

After

Width:  |  Height:  |  Size: 26 KiB

View File

Before

Width:  |  Height:  |  Size: 30 KiB

After

Width:  |  Height:  |  Size: 30 KiB

View File

Before

Width:  |  Height:  |  Size: 2.8 KiB

After

Width:  |  Height:  |  Size: 2.8 KiB

View File

Before

Width:  |  Height:  |  Size: 23 KiB

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

View File

@ -783,6 +783,10 @@ const tags = [
name: 'voyeur',
slug: 'voyeur',
},
{
name: 'virtual reality',
slug: 'virtual-reality',
},
{
name: 'wet',
slug: 'wet',
@ -1461,6 +1465,10 @@ const aliases = [
name: 'ts',
for: 'transsexual',
},
{
name: 'vr',
for: 'virtual reality',
},
{
name: 'whipping',
for: 'corporal-punishment',

View File

@ -13,6 +13,10 @@ const parentNetworks = [
url: 'https://www.mindgeek.com',
description: '',
},
{
slug: 'whalemember',
name: 'Whale Member',
},
];
const networks = [
@ -262,6 +266,7 @@ const networks = [
name: 'Porn Pros',
url: 'https://pornpros.com',
description: 'Watch the best HD exclusive movies and videos on Porn Pros. All the hottest new Pornstar and amateur girls in High Definition updated daily.',
parent: 'whalemember',
},
{
slug: 'private',

View File

@ -3620,91 +3620,6 @@ const sites = [
url: 'https://pornpros.com/site/pimpparade',
network: 'pornpros',
},
{
name: 'Cum 4K',
slug: 'cum4k',
url: 'https://cum4k.com',
tags: ['fake-cum', 'creampie', '4k'],
network: 'pornpros',
},
{
name: 'Tiny 4K',
slug: 'tiny4k',
url: 'https://tiny4k.com',
tags: ['4k'],
network: 'pornpros',
},
{
name: 'POVD',
slug: 'povd',
url: 'https://povd.com',
tags: ['pov'],
network: 'pornpros',
},
{
name: 'Lubed',
slug: 'lubed',
url: 'https://lubed.com',
tags: ['oil'],
network: 'pornpros',
},
{
name: 'Casting Couch X',
slug: 'castingcouchx',
url: 'https://castingcouch-x.com',
network: 'pornpros',
},
{
name: 'Passion HD',
slug: 'passionhd',
url: 'https://passion-hd.com',
network: 'pornpros',
},
{
name: 'Nanny Spy',
slug: 'nannyspy',
url: 'https://nannyspy.com',
network: 'pornpros',
},
{
name: 'Girl Cum',
slug: 'girlcum',
url: 'https://girlcum.com',
network: 'pornpros',
},
{
name: 'Pure Mature',
slug: 'puremature',
url: 'https://puremature.com',
tags: ['milf'],
network: 'pornpros',
},
{
name: 'Fantasy HD',
slug: 'fantasyhd',
url: 'https://fantasyhd.com',
network: 'pornpros',
},
{
name: 'Spy Fam',
slug: 'spyfam',
url: 'https://spyfam.com',
tags: ['family'],
network: 'pornpros',
},
{
name: 'Exotic 4K',
slug: 'exotic4k',
url: 'https://exotic4k.com',
tags: ['4k'],
network: 'pornpros',
},
{
name: 'Baeb',
slug: 'baeb',
url: 'https://baeb.com',
network: 'pornpros',
},
{
name: 'MILF Humiliation',
slug: 'milfhumiliation',
@ -3761,12 +3676,18 @@ const sites = [
network: 'pornpros',
scrape: false,
},
{
name: 'Webcam Hackers',
slug: 'webcamhackers',
url: 'https://webcamhackers.com',
network: 'pornpros',
scrape: false,
},
{
name: 'College Teens',
slug: 'collegeteens',
network: 'pornpros',
scrape: false,
show: false,
},
// PRIVATE
{
@ -5420,6 +5341,119 @@ const sites = [
description: 'Top rated models. Graceful locations. Best gonzo scenes. 4K UHD 60 FPS. So, in general Vogov is a website that is worth visiting and exploring carefully. It gives a chance to spend a fantastic night with gorgeous girls ready to experiment and to full around with their lovers.',
network: 'vogov',
},
// WHALE MEMBER
{
name: 'Cum 4K',
slug: 'cum4k',
url: 'https://cum4k.com',
tags: ['fake-cum', 'creampie', '4k'],
network: 'whalemember',
},
{
name: 'Tiny 4K',
slug: 'tiny4k',
url: 'https://tiny4k.com',
tags: ['4k'],
network: 'whalemember',
},
{
name: 'POVD',
slug: 'povd',
url: 'https://povd.com',
tags: ['pov'],
network: 'whalemember',
},
{
name: 'Lubed',
slug: 'lubed',
url: 'https://lubed.com',
tags: ['oil'],
network: 'whalemember',
},
{
name: 'Casting Couch X',
slug: 'castingcouchx',
url: 'https://castingcouch-x.com',
network: 'whalemember',
},
{
name: 'Passion HD',
slug: 'passionhd',
url: 'https://passion-hd.com',
network: 'whalemember',
},
{
name: 'Nanny Spy',
slug: 'nannyspy',
url: 'https://nannyspy.com',
network: 'whalemember',
},
{
name: 'Girl Cum',
slug: 'girlcum',
url: 'https://girlcum.com',
network: 'whalemember',
},
{
name: 'Pure Mature',
slug: 'puremature',
url: 'https://puremature.com',
tags: ['milf'],
network: 'whalemember',
},
{
name: 'Fantasy HD',
slug: 'fantasyhd',
url: 'https://fantasyhd.com',
network: 'whalemember',
},
{
name: 'Spy Fam',
slug: 'spyfam',
url: 'https://spyfam.com',
tags: ['family'],
network: 'whalemember',
},
{
name: 'Holed',
slug: 'holed',
url: 'https://holed.com',
tags: ['anal'],
network: 'whalemember',
},
{
name: 'BBC Pie',
slug: 'bbcpie',
url: 'https://bbcpie.com',
tags: ['bbc', 'interracial'],
network: 'whalemember',
},
{
name: 'Wet VR',
slug: 'wetvr',
url: 'https://wetvr.com',
tags: ['virtual-reality'],
network: 'whalemember',
},
{
name: 'Exotic 4K',
slug: 'exotic4k',
url: 'https://exotic4k.com',
tags: ['4k'],
network: 'whalemember',
},
{
name: 'My Very First Time',
slug: 'myveryfirsttime',
url: 'https://myveryfirsttime.com',
network: 'whalemember',
},
{
name: 'Baeb',
slug: 'baeb',
url: 'https://baeb.com',
network: 'whalemember',
},
// WICKED
{
slug: 'wicked',

View File

@ -12,6 +12,7 @@ const scrapers = require('./scrapers/scrapers');
const whereOr = require('./utils/where-or');
const resolvePlace = require('./utils/resolve-place');
const slugify = require('./utils/slugify');
const capitalize = require('./utils/capitalize');
// const { createMediaDirectory, storePhotos } = require('./media_legacy');
const { storeMedia, associateMedia } = require('./media');
@ -94,10 +95,7 @@ function curateActors(releases) {
function curateActorEntry(actor, scraped, scrapeSuccess) {
const curatedActor = {
name: actor.name
.split(' ')
.map(segment => `${segment.charAt(0).toUpperCase()}${segment.slice(1)}`)
.join(' '),
name: capitalize(actor.name),
slug: slugify(actor.name),
birthdate: actor.birthdate,
description: actor.description,
@ -305,12 +303,12 @@ async function mergeProfiles(profiles, actor) {
return prevProfile;
}
return {
const accProfile = {
id: actor ? actor.id : null,
name: actor ? actor.name : (prevProfile.name || profile.name),
description: prevProfile.description || profile.description,
gender: prevProfile.gender || profile.gender,
birthdate: Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate,
birthdate: !prevProfile.birthdate || Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate,
birthPlace: prevProfile.birthPlace || profile.birthPlace,
residencePlace: prevProfile.residencePlace || profile.residencePlace,
nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available
@ -328,9 +326,28 @@ async function mergeProfiles(profiles, actor) {
piercings: prevProfile.piercings || profile.piercings,
tattoos: prevProfile.tattoos || profile.tattoos,
social: prevProfile.social.concat(profile.social || []),
avatars: prevProfile.avatars.concat(profile.avatar ? [{ src: profile.avatar }] : []), // don't flatten fallbacks
releases: prevProfile.releases.concat(profile.releases ? profile.releases : []), // don't flatten fallbacks
};
if (profile.avatar) {
  // Normalize one avatar source (a plain URL string or a { src, copyright } object)
  // into a media source entry tagged with the scraper that produced it.
  // Only an *undefined* copyright falls back to the site name (or scraper slug);
  // an explicit value such as `null` is passed through unchanged.
  const curateAvatarSource = avatarSource => ({
    src: avatarSource.src || avatarSource,
    scraper: profile.scraper,
    copyright: avatarSource.copyright === undefined
      ? capitalize(profile.site?.name || profile.scraper)
      : avatarSource.copyright,
  });

  // An array is a list of fallback sources for a single avatar. Each item's own
  // copyright must be read from the item itself: the array has no `copyright`
  // property, so reading it from `profile.avatar` always yielded undefined.
  const avatar = Array.isArray(profile.avatar)
    ? profile.avatar.map(avatarX => curateAvatarSource(avatarX))
    : curateAvatarSource(profile.avatar);

  accProfile.avatars = prevProfile.avatars.concat([avatar]); // don't flatten fallbacks
} else {
  accProfile.avatars = prevProfile.avatars;
}
return accProfile;
}, {
social: [],
avatars: [],
@ -368,6 +385,9 @@ async function scrapeActors(actorNames) {
const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested
const [sites, networks] = await Promise.all([knex('sites').select('*').whereIn('slug', finalSources.flat()), knex('networks').select('*').whereIn('slug', finalSources.flat())]);
const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
const profiles = await Promise.map(finalSources, async (source) => {
// const [scraperSlug, scraper] = source;
const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] }));
@ -381,7 +401,8 @@ async function scrapeActors(actorNames) {
logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);
const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, argv.withReleases);
const site = sitesBySlug[scraperSlug] || null;
const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, argv.withReleases);
if (profile) {
logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);
@ -390,6 +411,7 @@ async function scrapeActors(actorNames) {
...profile,
name: actorName,
scraper: scraperSlug,
site,
};
}

View File

@ -137,6 +137,8 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
entropy,
quality: source.quality || null,
source: originalSource?.src || originalSource || source.src || source,
scraper: source.scraper,
copyright: source.copyright,
};
}
@ -191,6 +193,8 @@ async function saveItems(items, domain, role) {
extension: item.extension,
hash: item.hash,
entropy: item.entropy,
scraper: item.scraper,
copyright: item.copyright,
quality: item.quality,
source: item.source,
};
@ -204,7 +208,9 @@ async function saveItems(items, domain, role) {
extension: item.extension,
hash: item.hash,
entropy: item.entropy,
scraper: item.scraper,
quality: item.quality,
copyright: item.copyright,
source: item.source,
};
} catch (error) {
@ -220,8 +226,10 @@ function curateItemEntries(items) {
thumbnail: item.thumbpath,
mime: item.mimetype,
hash: item.hash,
source: item.source,
entropy: item.entropy,
source: item.source,
scraper: item.scraper,
copyright: item.copyright,
index,
}));
}
@ -308,16 +316,19 @@ function extractPrimaryItem(associations, targetId, role, primaryRole, primaryIt
function associateTargetMedia(targetId, sources, mediaBySource, domain, role, primaryRole, primaryItemsByTargetId) {
if (!sources) return { [role]: null, [primaryRole]: null };
const associations = sources
const mediaIds = sources
.filter(Boolean)
.map((source) => {
const mediaItem = Array.isArray(source)
? source.reduce((acc, sourceX) => acc || mediaBySource[sourceX.src || sourceX], null)
: mediaBySource[source.src || source];
return mediaItem && { [`${domain}_id`]: targetId, media_id: mediaItem.id };
})
.filter(Boolean);
// return mediaItem && { [`${domain}_id`]: targetId, media_id: mediaItem.id };
return mediaItem && mediaItem.id;
});
const uniqueMediaIds = Array.from(new Set(mediaIds.filter(Boolean)));
const associations = uniqueMediaIds.map(mediaId => ({ [`${domain}_id`]: targetId, media_id: mediaId }));
logger.silly(`Associating ${associations.length} ${role}s to ${domain} ${targetId}`);

View File

@ -67,7 +67,10 @@ function scrapeProfile(html) {
if (avatarThumbPath && !/NoImageAvailable/.test(avatarThumbPath)) {
const avatarPath = avatarThumbPath.slice(0, avatarThumbPath.lastIndexOf('/')).replace('thumb/', '');
profile.avatar = `http://www.boobpedia.com${avatarPath}`;
profile.avatar = {
src: `http://www.boobpedia.com${avatarPath}`,
copyright: null,
};
}
profile.social = qus('.infobox a.external');

View File

@ -52,7 +52,7 @@ function scrapeProfile(html, actorName) {
profile.social = Array.from(document.querySelectorAll('.profile-meta-item a.social-icons'), el => el.href);
const avatar = document.querySelector('.profile-image-large img').src;
if (!avatar.match('placeholder')) profile.avatar = document.querySelector('.profile-image-large img').src;
if (!avatar.match('placeholder')) profile.avatar = { src: avatar, copyright: null };
return profile;
}

View File

@ -8,6 +8,7 @@ const moment = require('moment');
const logger = require('../logger');
const { heightToCm } = require('../utils/convert');
const slugify = require('../utils/slugify');
async function fetchPhotos(url) {
const res = await bhttp.get(url);
@ -302,15 +303,15 @@ function scrapeProfile(html, url, actorName) {
if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString[0].split('-');
if (avatarEl) {
const src = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src') + 5, avatarEl.innerHTML.indexOf('set.jpg') + 7).trim();
const src0 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0') + 6, avatarEl.innerHTML.indexOf('set.jpg') + 7).trim();
const src1 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_1x') + 9, avatarEl.innerHTML.indexOf('1x.jpg') + 6).trim();
const src2 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_2x') + 9, avatarEl.innerHTML.indexOf('2x.jpg') + 6).trim();
const src3 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_3x') + 9, avatarEl.innerHTML.indexOf('3x.jpg') + 6).trim();
const avatarSources = [
avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_3x') + 9, avatarEl.innerHTML.indexOf('3x.jpg') + 6).trim(),
avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_2x') + 9, avatarEl.innerHTML.indexOf('2x.jpg') + 6).trim(),
avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_1x') + 9, avatarEl.innerHTML.indexOf('1x.jpg') + 6).trim(),
avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0') + 6, avatarEl.innerHTML.indexOf('set.jpg') + 7).trim(),
avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src') + 5, avatarEl.innerHTML.indexOf('set.jpg') + 7).trim(),
].filter(Boolean);
const avatar = src3 || src2 || src1 || src0 || src;
if (avatar) profile.avatar = avatar;
if (avatarSources.length) profile.avatar = avatarSources;
}
profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), el => el.href);
@ -343,8 +344,8 @@ async function fetchMovie(url, site) {
}
async function fetchProfile(actorName) {
const actorSlugA = actorName.toLowerCase().replace(/\s+/g, '-');
const actorSlugB = actorName.toLowerCase().replace(/\s+/g, '');
const actorSlugA = slugify(actorName, { delimiter: '-' });
const actorSlugB = slugify(actorName, { delimiter: '' });
const urlA = `https://julesjordan.com/trial/models/${actorSlugA}.html`;
const urlB = `https://julesjordan.com/trial/models/${actorSlugB}.html`;

View File

@ -6,7 +6,6 @@ const babes = require('./babes');
const bang = require('./bang');
const bangbros = require('./bangbros');
const blowpass = require('./blowpass');
const boobpedia = require('./boobpedia');
const brazzers = require('./brazzers');
const burningangel = require('./burningangel');
const cherrypimps = require('./cherrypimps');
@ -17,8 +16,6 @@ const evilangel = require('./evilangel');
const fakehub = require('./fakehub');
const famedigital = require('./famedigital');
const fantasymassage = require('./fantasymassage');
const freeones = require('./freeones');
const freeonesLegacy = require('./freeones_legacy');
const fullpornnetwork = require('./fullpornnetwork');
const girlsway = require('./girlsway');
const iconmale = require('./iconmale');
@ -41,7 +38,7 @@ const nubiles = require('./nubiles');
const perfectgonzo = require('./perfectgonzo');
const pervcity = require('./pervcity');
const pornhub = require('./pornhub');
const pornpros = require('./pornpros');
const whalemember = require('./whalemember');
const privateNetwork = require('./private'); // reserved keyword
const puretaboo = require('./puretaboo');
const realitykings = require('./realitykings');
@ -57,6 +54,11 @@ const vogov = require('./vogov');
const wicked = require('./wicked');
const xempire = require('./xempire');
// profiles
const boobpedia = require('./boobpedia');
const freeones = require('./freeones');
const freeonesLegacy = require('./freeones_legacy');
module.exports = {
releases: {
'21naturals': naturals,
@ -99,7 +101,7 @@ module.exports = {
perfectgonzo,
pervcity,
pimpxxx: cherrypimps,
pornpros,
pornpros: whalemember,
private: privateNetwork,
puretaboo,
realitykings,
@ -109,6 +111,7 @@ module.exports = {
vivid,
vixen,
vogov,
whalemember,
wicked,
xempire,
},

View File

@ -214,7 +214,7 @@ async function fetchScene(url, site, baseRelease) {
return res.code;
}
async function fetchProfile(actorName, scraperSlug, withReleases) {
async function fetchProfile(actorName, scraperSlug, site, withReleases) {
const origin = `https://www.${scraperSlug}.com`;
const actorSlug = slugify(actorName);
const url = `${origin}/api/${actorSlug}`;

10
src/utils/capitalize.js Normal file
View File

@ -0,0 +1,10 @@
'use strict';
/**
 * Upper-case the first character of every word in a string.
 *
 * Words are delimited by single whitespace characters; each delimiter is
 * replaced by a plain space in the result, and the remainder of every word
 * is left untouched.
 *
 * @param {string} string - text to capitalize
 * @returns {string} the text with each word's first character upper-cased
 */
function capitalize(string) {
  const words = string.split(/\s/);
  const capitalized = [];

  for (const word of words) {
    const initial = word.charAt(0).toUpperCase();
    const remainder = word.slice(1);

    capitalized.push(`${initial}${remainder}`);
  }

  return capitalized.join(' ');
}
module.exports = capitalize;

View File

@ -21,7 +21,10 @@ async function init() {
await Promise.all(posters.map(async (poster) => {
const source = path.join(config.media.path, poster.path);
const target = path.join(config.media.path, 'posters', `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_')} (${moment.utc(poster.date).format('YYYY-MM-DD')}).jpeg`);
const directory = path.join(config.media.path, 'extracted', poster.actor_name);
const target = path.join(directory, `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_')} (${moment.utc(poster.date).format('YYYY-MM-DD')}).jpeg`);
await fs.mkdir(path.join(directory), { recursive: true });
const file = await fs.readFile(source);
await fs.writeFile(target, file);