Fixed release sites for profile scraping.
This commit is contained in:
parent
9c8d914b75
commit
0d0acb6f3c
|
@ -401,6 +401,10 @@ async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) {
|
||||||
name: actorName,
|
name: actorName,
|
||||||
scraper: scraperSlug,
|
scraper: scraperSlug,
|
||||||
site,
|
site,
|
||||||
|
releases: profile.releases?.map(release => (typeof release === 'string'
|
||||||
|
? { url: release, site }
|
||||||
|
: { ...release, site: release.site || site }
|
||||||
|
)),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -427,7 +431,7 @@ async function scrapeActors(actorNames) {
|
||||||
|
|
||||||
const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested
|
const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested
|
||||||
|
|
||||||
const [siteEntries, networks] = await Promise.all([
|
const [siteEntries, networkEntries] = await Promise.all([
|
||||||
knex('sites')
|
knex('sites')
|
||||||
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
||||||
.select(
|
.select(
|
||||||
|
@ -439,9 +443,10 @@ async function scrapeActors(actorNames) {
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const sites = await curateSites(siteEntries, true);
|
const sites = await curateSites(siteEntries, true);
|
||||||
|
const networks = networkEntries.map(network => ({ ...network, isFallback: true }));
|
||||||
const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
|
const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
|
||||||
|
|
||||||
const profiles = await scrapeProfiles(sources, sitesBySlug, actorName, actorEntry);
|
const profiles = await scrapeProfiles(sources, actorName, actorEntry, sitesBySlug);
|
||||||
const profile = await mergeProfiles(profiles, actorEntry);
|
const profile = await mergeProfiles(profiles, actorEntry);
|
||||||
|
|
||||||
if (profile === null) {
|
if (profile === null) {
|
||||||
|
|
|
@ -31,7 +31,7 @@ async function init() {
|
||||||
|
|
||||||
if (argv.withReleases) {
|
if (argv.withReleases) {
|
||||||
const baseReleases = actors.map(actor => actor?.releases || []).flat();
|
const baseReleases = actors.map(actor => actor?.releases || []).flat();
|
||||||
const releases = await deepFetchReleases(baseReleases, null, 'scene');
|
const releases = await deepFetchReleases(baseReleases, null);
|
||||||
|
|
||||||
await storeReleases(releases);
|
await storeReleases(releases);
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,7 @@ const { JSDOM } = require('jsdom');
|
||||||
const moment = require('moment');
|
const moment = require('moment');
|
||||||
|
|
||||||
const logger = require('../logger')(__filename);
|
const logger = require('../logger')(__filename);
|
||||||
const { get, geta, ctxa } = require('../utils/q');
|
const { get, geta, ctxa, parseDate } = require('../utils/q');
|
||||||
const { heightToCm } = require('../utils/convert');
|
const { heightToCm } = require('../utils/convert');
|
||||||
const slugify = require('../utils/slugify');
|
const slugify = require('../utils/slugify');
|
||||||
|
|
||||||
|
@ -304,27 +304,37 @@ function scrapeProfile(html, url, actorName) {
|
||||||
const { document } = new JSDOM(html).window;
|
const { document } = new JSDOM(html).window;
|
||||||
|
|
||||||
const bio = document.querySelector('.model_bio').textContent;
|
const bio = document.querySelector('.model_bio').textContent;
|
||||||
const avatarEl = document.querySelector('.model_bio_pic');
|
const avatarEl = document.querySelector('.model_bio_pic img');
|
||||||
|
|
||||||
const profile = {
|
const profile = {
|
||||||
name: actorName,
|
name: actorName,
|
||||||
};
|
};
|
||||||
|
|
||||||
const heightString = bio.match(/\d+ feet \d+ inches/);
|
const heightString = bio.match(/\d+ feet \d+ inches/);
|
||||||
const ageString = bio.match(/Age:\s*\d{2}/);
|
const ageString = bio.match(/Age:\s*(\d{2})/);
|
||||||
|
const birthDateString = bio.match(/Age:\s*(\w+ \d{1,2}, \d{4})/);
|
||||||
const measurementsString = bio.match(/\w+-\d+-\d+/);
|
const measurementsString = bio.match(/\w+-\d+-\d+/);
|
||||||
|
|
||||||
|
if (birthDateString) profile.birthdate = parseDate(birthDateString[1], 'MMMM D, YYYY');
|
||||||
|
if (ageString) profile.age = Number(ageString[1]);
|
||||||
|
|
||||||
if (heightString) profile.height = heightToCm(heightString[0]);
|
if (heightString) profile.height = heightToCm(heightString[0]);
|
||||||
if (ageString) profile.age = Number(ageString[0].match(/\d{2}/)[0]);
|
|
||||||
if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString[0].split('-');
|
if (measurementsString) {
|
||||||
|
const [bust, waist, hip] = measurementsString[0].split('-');
|
||||||
|
|
||||||
|
if (bust) profile.bust = bust;
|
||||||
|
if (waist) profile.waist = Number(waist);
|
||||||
|
if (hip) profile.hip = Number(hip);
|
||||||
|
}
|
||||||
|
|
||||||
if (avatarEl) {
|
if (avatarEl) {
|
||||||
const avatarSources = [
|
const avatarSources = [
|
||||||
avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_3x') + 9, avatarEl.innerHTML.indexOf('3x.jpg') + 6).trim(),
|
avatarEl.getAttribute('src0_3x'),
|
||||||
avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_2x') + 9, avatarEl.innerHTML.indexOf('2x.jpg') + 6).trim(),
|
avatarEl.getAttribute('src0_2x'),
|
||||||
avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_1x') + 9, avatarEl.innerHTML.indexOf('1x.jpg') + 6).trim(),
|
avatarEl.getAttribute('src0_1x'),
|
||||||
avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0') + 6, avatarEl.innerHTML.indexOf('set.jpg') + 7).trim(),
|
avatarEl.getAttribute('src0'),
|
||||||
avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src') + 5, avatarEl.innerHTML.indexOf('set.jpg') + 7).trim(),
|
avatarEl.getAttribute('src'),
|
||||||
].filter(Boolean);
|
].filter(Boolean);
|
||||||
|
|
||||||
if (avatarSources.length) profile.avatar = avatarSources;
|
if (avatarSources.length) profile.avatar = avatarSources;
|
||||||
|
@ -332,6 +342,8 @@ function scrapeProfile(html, url, actorName) {
|
||||||
|
|
||||||
profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), el => el.href);
|
profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), el => el.href);
|
||||||
|
|
||||||
|
console.log(profile);
|
||||||
|
|
||||||
return profile;
|
return profile;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,10 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
function capitalize(string, trim = true) {
|
function capitalize(string, trim = true) {
|
||||||
|
if (!string) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
const capitalized = string
|
const capitalized = string
|
||||||
.split(/\s/)
|
.split(/\s/)
|
||||||
.map(component => `${component.charAt(0).toUpperCase()}${component.slice(1)}`)
|
.map(component => `${component.charAt(0).toUpperCase()}${component.slice(1)}`)
|
||||||
|
|
|
@ -10,20 +10,22 @@ const knex = require('../knex');
|
||||||
|
|
||||||
async function init() {
|
async function init() {
|
||||||
const posters = await knex('actors')
|
const posters = await knex('actors')
|
||||||
.select('actors.name as actor_name', 'releases.title', 'releases.date', 'media.path', 'sites.name as site_name', 'networks.name as network_name')
|
.select('actors.name as actor_name', 'releases.title', 'releases.date', 'media.path', 'media.index', 'sites.name as site_name', 'networks.name as network_name')
|
||||||
.whereIn('actors.name', argv.actors)
|
.whereIn('actors.name', (argv.actors || []).concat(argv._))
|
||||||
.join('releases_actors', 'releases_actors.actor_id', 'actors.id')
|
.join('releases_actors', 'releases_actors.actor_id', 'actors.id')
|
||||||
.join('releases', 'releases_actors.release_id', 'releases.id')
|
.join('releases', 'releases_actors.release_id', 'releases.id')
|
||||||
.join('releases_posters', 'releases_posters.release_id', 'releases.id')
|
|
||||||
.join('sites', 'sites.id', 'releases.site_id')
|
.join('sites', 'sites.id', 'releases.site_id')
|
||||||
.join('networks', 'networks.id', 'sites.network_id')
|
.join('networks', 'networks.id', 'sites.network_id')
|
||||||
|
.join('releases_posters', 'releases_posters.release_id', 'releases.id')
|
||||||
.join('media', 'releases_posters.media_id', 'media.id');
|
.join('media', 'releases_posters.media_id', 'media.id');
|
||||||
|
// .join('releases_photos', 'releases_photos.release_id', 'releases.id')
|
||||||
|
// .join('media', 'releases_photos.media_id', 'media.id');
|
||||||
|
|
||||||
await Promise.all(posters.map(async (poster) => {
|
await Promise.all(posters.map(async (poster) => {
|
||||||
const source = path.join(config.media.path, poster.path);
|
const source = path.join(config.media.path, poster.path);
|
||||||
|
|
||||||
const directory = path.join(config.media.path, 'extracted', poster.actor_name);
|
const directory = path.join(config.media.path, 'extracted', poster.actor_name);
|
||||||
const target = path.join(directory, `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_')} (${moment.utc(poster.date).format('YYYY-MM-DD')}).jpeg`);
|
const target = path.join(directory, `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_')} (${moment.utc(poster.date).format('YYYY-MM-DD')})-${poster.index}.jpeg`);
|
||||||
await fs.mkdir(path.join(directory), { recursive: true });
|
await fs.mkdir(path.join(directory), { recursive: true });
|
||||||
|
|
||||||
const file = await fs.readFile(source);
|
const file = await fs.readFile(source);
|
||||||
|
|
|
@ -334,11 +334,10 @@ module.exports = {
|
||||||
ex: extract,
|
ex: extract,
|
||||||
exa: extractAll,
|
exa: extractAll,
|
||||||
fd: formatDate,
|
fd: formatDate,
|
||||||
|
parseDate: extractDate,
|
||||||
ctx: init,
|
ctx: init,
|
||||||
ctxa: initAll,
|
ctxa: initAll,
|
||||||
geta: getAll,
|
geta: getAll,
|
||||||
edate: extractDate,
|
|
||||||
fdate: formatDate,
|
|
||||||
qu: quFuncs,
|
qu: quFuncs,
|
||||||
...legacyFuncs,
|
...legacyFuncs,
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue