Scraping and storing actor profiles.
This commit is contained in:
parent
11eb66f834
commit
21d4dd6bfa
|
@ -76,7 +76,7 @@ function initActorActions(store, _router) {
|
|||
name
|
||||
slug
|
||||
gender
|
||||
birthdate
|
||||
birthdate: dateOfBirth
|
||||
age
|
||||
ethnicity
|
||||
bust
|
||||
|
@ -229,7 +229,7 @@ function initActorActions(store, _router) {
|
|||
name
|
||||
slug
|
||||
age
|
||||
birthdate
|
||||
birthdate: dateOfBirth
|
||||
gender
|
||||
network {
|
||||
id
|
||||
|
|
|
@ -34,7 +34,7 @@ const actorFields = `
|
|||
id
|
||||
name
|
||||
slug
|
||||
birthdate
|
||||
birthdate: dateOfBirth
|
||||
age
|
||||
gender
|
||||
network {
|
||||
|
|
|
@ -272,7 +272,9 @@ exports.up = knex => Promise.resolve()
|
|||
.references('id')
|
||||
.inTable('actors');
|
||||
|
||||
table.date('birthdate');
|
||||
table.date('date_of_birth');
|
||||
table.date('date_of_death');
|
||||
|
||||
table.string('gender', 18);
|
||||
table.text('description');
|
||||
|
||||
|
@ -290,7 +292,8 @@ exports.up = knex => Promise.resolve()
|
|||
|
||||
table.string('ethnicity');
|
||||
|
||||
table.string('bust', 10);
|
||||
table.string('cup', 4);
|
||||
table.integer('bust', 3);
|
||||
table.integer('waist', 3);
|
||||
table.integer('hip', 3);
|
||||
table.boolean('natural_boobs');
|
||||
|
@ -330,10 +333,11 @@ exports.up = knex => Promise.resolve()
|
|||
.references('id')
|
||||
.inTable('sites');
|
||||
|
||||
table.unique(['actor_id', 'network_id']);
|
||||
table.unique(['actor_id', 'site_id']);
|
||||
table.unique(['actor_id', 'network_id', 'site_id']);
|
||||
|
||||
table.date('date_of_birth');
|
||||
table.date('date_of_death');
|
||||
|
||||
table.date('birthdate');
|
||||
table.string('gender', 18);
|
||||
table.text('description');
|
||||
|
||||
|
@ -351,7 +355,8 @@ exports.up = knex => Promise.resolve()
|
|||
|
||||
table.string('ethnicity');
|
||||
|
||||
table.string('bust', 10);
|
||||
table.string('cup', 4);
|
||||
table.integer('bust', 3);
|
||||
table.integer('waist', 3);
|
||||
table.integer('hip', 3);
|
||||
table.boolean('natural_boobs');
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 29 KiB |
Binary file not shown.
After Width: | Height: | Size: 27 KiB |
240
src/actors.js
240
src/actors.js
|
@ -5,11 +5,16 @@ const Promise = require('bluebird');
|
|||
|
||||
// const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const scrapers = require('./scrapers/scrapers');
|
||||
const scrapers = require('./scrapers/scrapers').actors;
|
||||
|
||||
const argv = require('./argv');
|
||||
const include = require('./utils/argv-include')(argv);
|
||||
const logger = require('./logger')(__filename);
|
||||
const slugify = require('./utils/slugify');
|
||||
const capitalize = require('./utils/capitalize');
|
||||
const resolvePlace = require('./utils/resolve-place');
|
||||
|
||||
const { toBaseReleases } = require('./deep');
|
||||
|
||||
function toBaseActors(actorsOrNames, release) {
|
||||
return actorsOrNames.map((actorOrName) => {
|
||||
|
@ -46,6 +51,224 @@ function curateActorEntries(baseActors, batchId) {
|
|||
return baseActors.map(baseActor => curateActorEntry(baseActor, batchId));
|
||||
}
|
||||
|
||||
/**
 * Maps a curated profile (camelCase properties) onto a row for the
 * `actors_profiles` table (snake_case columns).
 *
 * @param {Object} profile - curated profile; `site`, `network`, `placeOfBirth` and `placeOfResidence` are optional
 * @returns {Object} database entry; a missing site, network or place detail is stored as null
 */
function curateProfileEntry(profile) {
	// Places carry their country as a plain alpha-2 string (resolvePlace upper-cases
	// the country code, the nationality fallback stores `country.alpha2`), so reading
	// `.alpha2` off it always produced null. An object with an `alpha2` property is
	// still accepted for backward compatibility.
	const getCountryAlpha2 = (place) => {
		const country = place?.country;

		if (typeof country === 'string') {
			return country || null;
		}

		return country?.alpha2 || null;
	};

	const { placeOfBirth, placeOfResidence } = profile;

	return {
		actor_id: profile.id,
		site_id: profile.site?.id || null,
		network_id: profile.network?.id || null,
		date_of_birth: profile.dateOfBirth,
		date_of_death: profile.dateOfDeath,
		gender: profile.gender,
		ethnicity: profile.ethnicity,
		description: profile.description,
		birth_city: placeOfBirth?.city || null,
		birth_state: placeOfBirth?.state || null,
		birth_country_alpha2: getCountryAlpha2(placeOfBirth),
		residence_city: placeOfResidence?.city || null,
		residence_state: placeOfResidence?.state || null,
		residence_country_alpha2: getCountryAlpha2(placeOfResidence),
		cup: profile.cup,
		bust: profile.bust,
		waist: profile.waist,
		hip: profile.hip,
		natural_boobs: profile.naturalBoobs,
		height: profile.height,
		weight: profile.weight,
		hair: profile.hair,
		eyes: profile.eyes,
		has_tattoos: profile.hasTattoos,
		has_piercings: profile.hasPiercings,
		piercings: profile.piercings,
		tattoos: profile.tattoos,
	};
}
|
||||
|
||||
/** Trims a scraped string; an empty or missing value becomes null. */
function trimOrNull(value) {
	return value?.trim() || null;
}

/**
 * Reads a measurement that scrapers may deliver either as a number or as text
 * such as '24 in'. Always returns a number (or null), never a digit string.
 */
function extractNumber(value) {
	if (value === null || value === undefined) {
		return null;
	}

	// String() guards against calling .match on a numeric value
	return Number(value) || Number(String(value).match(/\d+/)?.[0]) || null;
}

/**
 * Picks the date of birth from either property name scrapers use, and rejects
 * values that are not a valid date or that are less than 18 years in the past.
 */
function curateDateOfBirth(profile) {
	const dateOfBirth = profile.dateOfBirth || profile.birthdate;

	if (!dateOfBirth || Number.isNaN(Number(dateOfBirth))) {
		return null;
	}

	// 567648000000 ms = 18 years of 365 days
	return new Date() - dateOfBirth > 567648000000 ? dateOfBirth : null;
}

/**
 * Normalizes a raw scraped profile: trims text, maps gender onto a fixed set of
 * values, validates dates, splits measurements, resolves places and filters
 * social links down to valid URLs.
 *
 * @param {Object} profile - raw profile merged with its actor; `profile.site` is the site or network it was scraped from
 * @returns {Promise<Object|null>} the curated profile, or null when curation failed (the error is logged)
 */
async function curateProfile(profile) {
	try {
		const { isNetwork } = profile.site;

		const curatedProfile = {
			id: profile.id,
			name: profile.name,
			avatar: profile.avatar,
			// a profile belongs to either a site or a network, never both
			site: isNetwork ? null : profile.site,
			network: isNetwork ? profile.site : null,
			description: trimOrNull(profile.description),
			nationality: trimOrNull(profile.nationality), // used to derive country when country not available
			ethnicity: trimOrNull(profile.ethnicity),
			hair: trimOrNull(profile.hair),
			eyes: trimOrNull(profile.eyes),
			tattoos: trimOrNull(profile.tattoos),
			piercings: trimOrNull(profile.piercings),
		};

		// order matters: 'female' must be tested before 'male', which it contains
		curatedProfile.gender = (/female/i.test(profile.gender) && 'female')
			|| (/shemale/i.test(profile.gender) && 'transsexual')
			|| (/male/i.test(profile.gender) && 'male')
			|| (/trans/i.test(profile.gender) && 'transsexual')
			|| null;

		curatedProfile.dateOfBirth = curateDateOfBirth(profile);
		curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath;

		// bust may be combined with the cup size, e.g. '34C'
		curatedProfile.cup = profile.cup || String(profile.bust ?? '').match(/[a-zA-Z]+/)?.[0] || null;
		curatedProfile.bust = extractNumber(profile.bust);
		curatedProfile.waist = extractNumber(profile.waist);
		curatedProfile.hip = extractNumber(profile.hip);
		curatedProfile.height = extractNumber(profile.height);
		curatedProfile.weight = extractNumber(profile.weight);

		curatedProfile.naturalBoobs = typeof profile.naturalBoobs === 'boolean' ? profile.naturalBoobs : null;
		curatedProfile.hasTattoos = typeof profile.hasTattoos === 'boolean' ? profile.hasTattoos : null;
		curatedProfile.hasPiercings = typeof profile.hasPiercings === 'boolean' ? profile.hasPiercings : null;

		const [placeOfBirth, placeOfResidence] = await Promise.all([
			resolvePlace(profile.birthPlace),
			resolvePlace(profile.residencePlace),
		]);

		curatedProfile.placeOfBirth = placeOfBirth;
		curatedProfile.placeOfResidence = placeOfResidence;

		if (!curatedProfile.placeOfBirth && curatedProfile.nationality) {
			const country = await knex('countries')
				.where('nationality', 'ilike', `%${curatedProfile.nationality}%`)
				.orderBy('priority', 'desc')
				.first();

			// an unknown nationality must not discard the rest of the profile
			if (country) {
				curatedProfile.placeOfBirth = {
					country: country.alpha2,
				};
			}
		}

		curatedProfile.social = Array.isArray(profile.social)
			? profile.social.map((social) => {
				try {
					// the URL constructor validates and normalizes the link
					return new URL(social).href;
				} catch (error) {
					logger.warn(`Profile scraper for '${profile.site.name}' returned invalid social link: ${social}`);
					return null;
				}
			}).filter(Boolean)
			: [];

		curatedProfile.releases = toBaseReleases(profile.releases);

		return curatedProfile;
	} catch (error) {
		logger.error(`Failed to curate '${profile.name}': ${error.message}`);

		return null;
	}
}
|
||||
|
||||
/**
 * Fetches the actor's profile from one scraper.
 *
 * @returns {Promise<Object>} the actor merged with the scraped profile; `site` is the matching site or network
 * @throws {Error} when the scraper or its site/network is missing, or when the
 *   profile is unavailable (`code` is 'PROFILE_NOT_AVAILABLE' in that case)
 */
async function fetchSourceProfile(actor, scraperSlug, networksBySlug, sitesBySlug) {
	const scraper = scrapers[scraperSlug];
	const siteOrNetwork = networksBySlug[scraperSlug] || sitesBySlug[scraperSlug];

	if (!scraper?.fetchProfile) {
		logger.warn(`No profile scraper available for ${scraperSlug}`);
		throw new Error(`No profile scraper available for ${scraperSlug}`);
	}

	if (!siteOrNetwork) {
		logger.warn(`No site or network found for ${scraperSlug}`);
		throw new Error(`No site or network found for ${scraperSlug}`);
	}

	logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);

	const profile = await scraper.fetchProfile(actor.name, scraperSlug, siteOrNetwork, include);

	if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
		logger.verbose(`Profile for '${actor.name}' not available on ${scraperSlug}, scraper returned ${profile}`);
		throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${scraperSlug}`), { code: 'PROFILE_NOT_AVAILABLE' });
	}

	return {
		...actor,
		...profile,
		site: siteOrNetwork,
	};
}

/**
 * Scrapes one profile per source for the given actor. All sources are queried
 * concurrently. A source may be a single scraper slug or an array of slugs; an
 * array is a fallback chain that is tried in order until one scraper succeeds.
 *
 * @param {Object} actor - actor entry; `name` is passed to the scrapers
 * @param {Array<string|string[]>} sources - scraper slugs or fallback chains of slugs
 * @param {Object} networksBySlug - networks keyed by slug, looked up before sites
 * @param {Object} sitesBySlug - sites keyed by slug
 * @returns {Promise<Object[]>} the profiles that were found; failed sources are left out
 */
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
	const profiles = await Promise.all(sources.map(async (source) => {
		let lastError = null;

		// sequential on purpose: the next scraper is only a fallback for the previous one
		for (const scraperSlug of [].concat(source)) {
			try {
				return await fetchSourceProfile(actor, scraperSlug, networksBySlug, sitesBySlug);
			} catch (error) {
				lastError = error;
			}
		}

		// an unavailable profile is expected and already logged verbosely;
		// an empty fallback chain leaves nothing to report
		if (lastError && lastError.code !== 'PROFILE_NOT_AVAILABLE') {
			logger.error(`Failed to fetch profile for '${actor.name}': ${lastError.message}`);
		}

		return null;
	}));

	return profiles.filter(Boolean);
}
|
||||
|
||||
/**
 * Stores profile entries in `actors_profiles`. Entries without a stored
 * counterpart (same actor, network and site) are inserted; entries that
 * already exist are only updated when `--force` is set.
 *
 * @param {Object[]} curatedProfileEntries - rows as produced by curateProfileEntry
 * @returns {Promise<void>} resolves once all inserts and updates have completed
 */
async function upsertProfiles(curatedProfileEntries) {
	if (curatedProfileEntries.length === 0) {
		return;
	}

	const existingProfiles = await knex('actors_profiles')
		.whereIn(['actor_id', 'network_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.network_id]))
		.orWhereIn(['actor_id', 'site_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.site_id]));

	// template string keeps null distinguishable ('null'), matching the stored rows
	const getKey = entry => `${entry.actor_id}:${entry.network_id}:${entry.site_id}`;
	const existingProfilesByKey = new Map(existingProfiles.map(profile => [getKey(profile), profile]));

	const newProfileEntries = [];
	const updatingProfileEntries = [];

	curatedProfileEntries.forEach((profileEntry) => {
		const existingProfile = existingProfilesByKey.get(getKey(profileEntry));

		if (existingProfile) {
			updatingProfileEntries.push({ ...profileEntry, id: existingProfile.id });
			return;
		}

		newProfileEntries.push(profileEntry);
	});

	if (newProfileEntries.length > 0) {
		await knex('actors_profiles').insert(newProfileEntries);
	}

	if (argv.force && updatingProfileEntries.length > 0) {
		// Awaited so callers don't continue (or close the connection) mid-update.
		// Knex commits when the handler's promise resolves and rolls back when it
		// rejects, so the failure also reaches the caller instead of being swallowed.
		await knex.transaction(async transaction => Promise.all(updatingProfileEntries.map(profileEntry => knex('actors_profiles')
			.where('id', profileEntry.id)
			.update(profileEntry)
			.transacting(transaction))));
	}
}
|
||||
|
||||
async function scrapeActors(actorNames) {
|
||||
const baseActors = toBaseActors(actorNames);
|
||||
|
||||
|
@ -71,9 +294,20 @@ async function scrapeActors(actorNames) {
|
|||
const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId);
|
||||
const newActorEntries = batchId && await knex('actors').insert(curatedActorEntries).returning(['id', 'name', 'slug']);
|
||||
|
||||
const actorEntries = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
|
||||
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
|
||||
|
||||
console.log(actorEntries, newActorEntries, actorEntries);
|
||||
// TODO: don't fetch existing profiles unless --force is used
|
||||
|
||||
const profilesPerActor = await Promise.map(
|
||||
actors,
|
||||
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug),
|
||||
{ concurrency: 10 },
|
||||
);
|
||||
|
||||
const profiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));
|
||||
const curatedProfileEntries = profiles.map(profile => curateProfileEntry(profile));
|
||||
|
||||
await upsertProfiles(curatedProfileEntries);
|
||||
}
|
||||
|
||||
async function getOrCreateActors(baseActors, batchId) {
|
||||
|
|
|
@ -167,4 +167,5 @@ module.exports = {
|
|||
fetchReleases,
|
||||
fetchScenes,
|
||||
fetchMovies,
|
||||
toBaseReleases,
|
||||
};
|
||||
|
|
|
@ -8,10 +8,10 @@ const moment = require('moment');
|
|||
const argv = require('../argv');
|
||||
const knex = require('../knex');
|
||||
|
||||
async function init() {
|
||||
async function actorPosters(actorNames) {
|
||||
const posters = await knex('actors')
|
||||
.select('actors.name as actor_name', 'releases.title', 'releases.date', 'media.path', 'media.index', 'sites.name as site_name', 'networks.name as network_name')
|
||||
.whereIn('actors.name', (argv.actors || []).concat(argv._))
|
||||
.whereIn('actors.name', actorNames)
|
||||
.join('releases_actors', 'releases_actors.actor_id', 'actors.id')
|
||||
.join('releases', 'releases_actors.release_id', 'releases.id')
|
||||
.join('sites', 'sites.id', 'releases.site_id')
|
||||
|
@ -37,4 +37,41 @@ async function init() {
|
|||
knex.destroy();
|
||||
}
|
||||
|
||||
/**
 * Copies the poster of every release of the given sites into
 * `<media path>/extracted/<site name>/`, naming each file after the site,
 * the release date and the release title, then prints the copied paths.
 * The database connection is closed afterwards, also when a step fails.
 *
 * @param {string[]} siteSlugs - slugs of the sites to extract posters for
 * @returns {Promise<void>}
 */
async function sitePosters(siteSlugs) {
	try {
		const posters = await knex('sites')
			.select('sites.name as site_name', 'releases.title', 'releases.date', 'media.path')
			.whereIn('sites.slug', siteSlugs)
			.join('releases', 'releases.site_id', 'sites.id')
			.join('releases_posters', 'releases_posters.release_id', 'releases.id')
			.join('media', 'releases_posters.media_id', 'media.id');
		// .where('releases.date', '<', '2020-01-01');

		const files = await Promise.all(posters.map(async (poster) => {
			const directory = path.join(config.media.path, 'extracted', poster.site_name);

			const source = path.join(config.media.path, poster.path);
			// slashes and dots in the title are replaced so it stays a single file name
			const target = path.join(directory, `${poster.site_name} - ${moment.utc(poster.date).format('YYYY-MM-DD')} - ${poster.title.replace(/[/.]/g, '_')}.jpeg`);

			await fs.mkdir(directory, { recursive: true });
			await fs.copyFile(source, target);

			return target;
		}));

		console.log(files);
	} finally {
		// release the connection even when the query or a copy failed
		await knex.destroy();
	}
}
|
||||
|
||||
/**
 * Entry point of the poster extraction tool: extracts posters for the actors
 * passed as `--actors` when present, otherwise for the sites passed as
 * `--sites`. Does nothing when neither argument is given.
 */
async function init() {
	const { actors, sites } = argv;

	if (actors) {
		await actorPosters(actors);
	} else if (sites) {
		await sitePosters(sites);
	}
}
|
||||
|
||||
init();
|
||||
|
|
|
@ -1,25 +1,34 @@
|
|||
'use strict';
|
||||
|
||||
const bhttp = require('bhttp');
|
||||
const logger = require('../logger')(__filename);
|
||||
const http = require('./http');
|
||||
|
||||
async function resolvePlace(query) {
|
||||
if (!query) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const res = await bhttp.get(`https://nominatim.openstreetmap.org/search/${encodeURI(query)}?format=json&accept-language=en&addressdetails=1`);
|
||||
const [item] = res.body;
|
||||
try {
|
||||
// https://operations.osmfoundation.org/policies/nominatim/
|
||||
const res = await http.get(`https://nominatim.openstreetmap.org/search/${encodeURI(query)}?format=json&accept-language=en&addressdetails=1`, {
|
||||
'User-Agent': 'contact at moonloop.adult@protonmail.com',
|
||||
});
|
||||
|
||||
if (item && item.address) {
|
||||
const rawPlace = item.address;
|
||||
const place = {};
|
||||
const [item] = res.body;
|
||||
|
||||
if (rawPlace.city) place.city = rawPlace.city;
|
||||
if (rawPlace.state) place.state = rawPlace.state;
|
||||
if (rawPlace.country_code) place.country = rawPlace.country_code.toUpperCase();
|
||||
if (rawPlace.continent) place.continent = rawPlace.continent;
|
||||
if (item && item.address) {
|
||||
const rawPlace = item.address;
|
||||
const place = {};
|
||||
|
||||
return place;
|
||||
if (rawPlace.city) place.city = rawPlace.city;
|
||||
if (rawPlace.state) place.state = rawPlace.state;
|
||||
if (rawPlace.country_code) place.country = rawPlace.country_code.toUpperCase();
|
||||
if (rawPlace.continent) place.continent = rawPlace.continent;
|
||||
|
||||
return place;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to resolve place '${query}': ${error.message}`);
|
||||
}
|
||||
|
||||
return null;
|
||||
|
|
Loading…
Reference in New Issue