Generating and using URL slugs for releases, improved slugify module. Added 'extract' parameter to MindGeek scraper to get scenes not associated with a channel (see Digital Playground). Added various high-res logos.
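As context for the diff below: the slugify helper now takes an options object instead of positional encode/delimiter arguments, and gains a limit that truncates on word boundaries. A minimal usage sketch with made-up input strings (the release code itself passes config.titleSlugLength as the limit):

const slugify = require('./utils/slugify');

slugify('Digital Playground Scene Title');
// -> 'digital-playground-scene-title' (hyphen delimiter, no URI encoding by default)

slugify('Hair Color', { delimiter: '_' });
// -> 'hair_color' (underscore delimiter, as used for profile bio keys in this diff)

slugify('A Very Long Scene Title', { encode: true, limit: 15 });
// -> 'a-very-long' (words that would push the slug past the limit are dropped)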
@@ -367,6 +367,7 @@ async function scrapeActors(actorNames) {
     } catch (error) {
       if (error.warn !== false) {
         logger.warn(`Error in scraper ${source}: ${error.message}`);
+        logger.error(error.stack);
       }
     }

@@ -39,7 +39,7 @@ const { argv } = yargs
     describe: 'Scrape profiles for new actors after fetching scenes',
     type: 'boolean',
     alias: 'with-actors',
-    default: true,
+    default: false,
   })
   .option('scene', {
     describe: 'Scrape scene info from URL',

@@ -1,5 +1,6 @@
 'use strict';

+const config = require('config');
 const Promise = require('bluebird');
 const moment = require('moment');

@@ -16,6 +17,7 @@ const {
   storeTrailer,
 } = require('./media');
 const { fetchSites, findSiteByUrl } = require('./sites');
+const slugify = require('./utils/slugify');

 function commonQuery(queryBuilder, {
   filter = [],

@@ -204,6 +206,11 @@ async function attachStudio(release) {
 }

 async function curateReleaseEntry(release) {
+  const slug = slugify(release.title, {
+    encode: true,
+    limit: config.titleSlugLength,
+  });
+
   const curatedRelease = {
     site_id: release.site.id,
     studio_id: release.studio ? release.studio.id : null,

@@ -213,6 +220,7 @@ async function curateReleaseEntry(release) {
     type: release.type,
     url: release.url,
     title: release.title,
+    slug,
     date: release.date,
     description: release.description,
     // director: release.director,

@@ -397,7 +405,7 @@ async function storeRelease(release) {
   logger.info(`Stored release "${release.title}" (${releaseEntry.id}, ${release.site.name})`);

-  return releaseEntry.id;
+  return releaseEntry;
 }

 async function storeReleases(releases) {

@@ -405,10 +413,11 @@ async function storeReleases(releases) {
     try {
       const releaseWithChannelSite = await attachChannelSite(release);
       const releaseWithStudio = await attachStudio(releaseWithChannelSite);
-      const releaseId = await storeRelease(releaseWithStudio);
+      const { id, slug } = await storeRelease(releaseWithStudio);

       return {
-        id: releaseId,
+        id,
+        slug,
         ...releaseWithChannelSite,
       };
     } catch (error) {

@@ -97,7 +97,7 @@ async function scrapeReleases(sources, release = null, type = 'scene') {
     const { releases: storedReleases } = await storeReleases(curatedReleases);

     if (storedReleases) {
-      console.log(storedReleases.map(storedRelease => `http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`).join('\n'));
+      logger.info(storedReleases.map(storedRelease => `\nhttp://${config.web.host}:${config.web.port}/scene/${storedRelease.id}/${storedRelease.slug}`).join(''));
     }
   }
 }

@@ -58,7 +58,7 @@ async function scrapeScene(scene, site, tokens) {
     },
   };

-  release.url = `${site.url}/scene/${release.entryId}/${slugify(release.title, true)}`;
+  release.url = `${site.url}/scene/${release.entryId}/${slugify(release.title, { encode: true })}`;
   release.date = new Date(scene.sites.collection[scene.id].publishDate);
   release.poster = scene._resources.primary[0].url;

@@ -26,6 +26,16 @@ function getThumbs(scene) {
 }

 function scrapeLatestX(data, site) {
+  if (site.parameters?.extract === true && data.collections.length > 0) {
+    // release should not belong to any channel
+    return null;
+  }
+
+  if (typeof site.parameters?.extract === 'string' && !data.collections.some(collection => collection.shortName === site.parameters.extract)) {
+    // release should belong to specific channel
+    return null;
+  }
+
   const { id: entryId, title, description } = data;
   const hostname = site.parameters?.native ? site.url : site.network.url;
   const url = `${hostname}/scene/${entryId}/`;

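A note on how the new extract parameter is interpreted by scrapeLatestX() above: it filters the network-wide release feed by channel membership. The parameter values below are hypothetical illustrations, not actual seed data:

// extract === true: keep only releases not attached to any channel collection
const digitalPlaygroundParams = { extract: true };

// extract as a string: keep only releases whose collections include that shortName
const channelParams = { extract: 'dpw' }; // 'dpw' is a made-up shortName

// Non-matching releases come back as null from scrapeLatestX(), and the
// reworked scrapeLatest() below drops them with .filter(Boolean).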
@@ -58,7 +68,9 @@ function scrapeLatestX(data, site) {
 }

 async function scrapeLatest(items, site) {
-  return Promise.all(items.map(async data => scrapeLatestX(data, site)));
+  const latestReleases = await Promise.all(items.map(async data => scrapeLatestX(data, site)));
+
+  return latestReleases.filter(Boolean);
 }

 function scrapeScene(data, url, _site) {

@@ -85,10 +97,10 @@ function scrapeScene(data, url, _site) {
     };
   }

-  const siteName = data.collections[0].name;
+  const siteName = data.collections[0]?.name || data.brand;
   release.channel = siteName.replace(/\s+/g, '').toLowerCase();

-  release.url = url || `https://www.realitykings.com/scene/${entryId}/`;
+  release.url = url || `https://www.${data.brand}.com/scene/${entryId}/`;

   return release;
 }

@@ -104,6 +116,9 @@ function getUrl(site) {
     return `${site.url}/scenes`;
   }

+  if (site.parameters?.extract) {
+    return `${site.url}/scenes`;
+  }
+
   if (site.parameters?.siteId) {
     return `${site.network.url}/scenes?site=${site.parameters.siteId}`;

@@ -144,7 +159,7 @@ function scrapeProfile(data, html, releases = []) {
   if (data.height) profile.height = inchesToCm(data.height);
   if (data.weight) profile.weight = lbsToKg(data.weight);

-  if (data.images.card_main_rect && data.images.card_main_rect[0]) {
+  if (data.images.card_main_rect?.[0]) {
     profile.avatar = data.images.card_main_rect[0].xl?.url
       || data.images.card_main_rect[0].lg?.url
       || data.images.card_main_rect[0].md?.url

@@ -169,7 +184,7 @@ async function fetchLatest(site, page = 1) {

   const beforeDate = moment().add('1', 'day').format('YYYY-MM-DD');
   const limit = 10;
-  const apiUrl = site.parameters?.native
+  const apiUrl = site.parameters?.native || site.parameters?.extract
     ? `https://site-api.project1service.com/v2/releases?dateReleased=<${beforeDate}&limit=${limit}&offset=${limit * (page - 1)}&orderBy=-dateReleased&type=scene`
     : `https://site-api.project1service.com/v2/releases?collectionId=${siteId}&dateReleased=<${beforeDate}&limit=${limit}&offset=${limit * (page - 1)}&orderBy=-dateReleased&type=scene`;

@@ -77,7 +77,7 @@ async function scrapeScene(html, url, site) {
   release.actors = qa('.value a[href*=models], .value a[href*=performer], .value a[href*=teen-babes]', true);

   if (release.actors.length === 0) {
-    const actorEl = qa('.stat').find(stat => /Featuring/.test(stat.textContent))
+    const actorEl = qa('.stat').find(stat => /Featuring/.test(stat.textContent));
     const actorString = qtext(actorEl);

     console.log(actorString);

@@ -147,7 +147,7 @@ function scrapeProfile(html) {

   const bio = qa('.stat').reduce((acc, el) => {
     const prop = q(el, '.label', true).slice(0, -1);
-    const key = slugify(prop, false, '_');
+    const key = slugify(prop, { delimiter: '_' });
     const value = q(el, '.value', true);

     return {

@@ -60,7 +60,7 @@ function destructConfigNetworks(networks = []) {
 }

 async function findSiteByUrl(url) {
-  const { origin, pathname } = new URL(url);
+  const { origin, hostname, pathname } = new URL(url);
   // const domain = hostname.replace(/www.|tour./, '');
   const dirUrl = `${origin}${pathname.split('/').slice(0, 2).join('/')}`; // allow for sites on URI directory

@@ -72,6 +72,9 @@ async function findSiteByUrl(url) {
     )
     .where('sites.url', url)
     .orWhere('sites.url', origin)
+    .orWhere('sites.url', origin.replace(/www\.|tour\./, ''))
+    .orWhere('sites.url', `https://www.${hostname}`)
+    .orWhere('sites.url', `http://www.${hostname}`)
     .orWhere('sites.url', dirUrl)
     // .orWhere('sites.url', 'like', `%${domain}`)
     .first();

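To illustrate the widened URL matching in findSiteByUrl() above, here is a sketch of the candidate values the query now compares against sites.url; example.com is a placeholder domain, not a real entry:

const url = 'https://example.com/scene/123/title-slug'; // placeholder scene URL
const { origin, hostname, pathname } = new URL(url);
const dirUrl = `${origin}${pathname.split('/').slice(0, 2).join('/')}`;

// Values compared against sites.url:
// url                                -> 'https://example.com/scene/123/title-slug'
// origin                             -> 'https://example.com'
// origin.replace(/www\.|tour\./, '') -> 'https://example.com' (www./tour. prefix stripped, new)
// `https://www.${hostname}`          -> 'https://www.example.com' (new)
// `http://www.${hostname}`           -> 'http://www.example.com'  (new)
// dirUrl                             -> 'https://example.com/scene' (site hosted in a URI directory)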
@@ -9,16 +9,18 @@ const knex = require('../knex');

 async function init() {
   const posters = await knex('actors')
-    .select('actors.name', 'releases.title', 'media.path')
-    .whereIn('name', argv.actors)
+    .select('actors.name as actor_name', 'releases.title', 'media.path', 'sites.name as site_name', 'networks.name as network_name')
+    .whereIn('actors.name', argv.actors)
     .join('releases_actors', 'releases_actors.actor_id', 'actors.id')
     .join('releases', 'releases_actors.release_id', 'releases.id')
     .join('releases_posters', 'releases_posters.release_id', 'releases.id')
+    .join('sites', 'sites.id', 'releases.site_id')
+    .join('networks', 'networks.id', 'sites.network_id')
     .join('media', 'releases_posters.media_id', 'media.id');

   await Promise.all(posters.map(async (poster) => {
     const source = path.join(config.media.path, poster.path);
-    const target = path.join(config.media.path, 'posters', `${poster.title.replace('/', '_')}.${poster.name}.jpeg`);
+    const target = path.join(config.media.path, 'posters', `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_')}.jpeg`);

     const file = await fs.readFile(source);
     await fs.writeFile(target, file);

@@ -1,7 +1,21 @@
 'use strict';

-function slugify(string, encode = false, delimiter = '-') {
-  const slug = string.trim().toLowerCase().match(/\w+/g).join(delimiter);
+function slugify(string, {
+  encode = false,
+  delimiter = '-',
+  limit = 1000,
+} = {}) {
+  const slugComponents = string.trim().toLowerCase().match(/\w+/g);
+
+  const slug = slugComponents.reduce((acc, component, index) => {
+    const accSlug = `${acc}${index > 0 ? delimiter : ''}${component}`;
+
+    if (accSlug.length < limit) {
+      return accSlug;
+    }
+
+    return acc;
+  }, '');

   return encode ? encodeURI(slug) : slug;
 }