forked from DebaucheryLibrarian/traxxx
Added sparse date mode. Fixed profile fetch error catching. Updated Kelly Madison scraper, using site IDs and fixed profile scraper.
This commit is contained in:
parent
ffea90b5e8
commit
23f76fd1be
|
@ -273,6 +273,7 @@ module.exports = {
|
||||||
'topwebmodels',
|
'topwebmodels',
|
||||||
'pascalssubsluts',
|
'pascalssubsluts',
|
||||||
'kellymadison',
|
'kellymadison',
|
||||||
|
'5kporn',
|
||||||
'private',
|
'private',
|
||||||
'bangbros',
|
'bangbros',
|
||||||
'hitzefrei',
|
'hitzefrei',
|
||||||
|
|
|
@ -5281,23 +5281,30 @@ const sites = [
|
||||||
name: 'Teen Fidelity',
|
name: 'Teen Fidelity',
|
||||||
alias: ['tf'],
|
alias: ['tf'],
|
||||||
url: 'https://www.teenfidelity.com',
|
url: 'https://www.teenfidelity.com',
|
||||||
description: 'Home of Kelly Madison and Ryan Madison',
|
|
||||||
parent: 'kellymadison',
|
parent: 'kellymadison',
|
||||||
|
parameters: {
|
||||||
|
siteId: 3,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
slug: 'pornfidelity',
|
slug: 'pornfidelity',
|
||||||
name: 'Porn Fidelity',
|
name: 'Porn Fidelity',
|
||||||
alias: ['pf'],
|
alias: ['pf'],
|
||||||
url: 'https://www.pornfidelity.com',
|
url: 'https://www.pornfidelity.com',
|
||||||
description: 'Home of Kelly Madison and Ryan Madison',
|
|
||||||
parent: 'kellymadison',
|
parent: 'kellymadison',
|
||||||
|
parameters: {
|
||||||
|
siteId: 2,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
slug: 'kellymadison',
|
slug: 'kellymadison',
|
||||||
name: 'Kelly Madison',
|
name: 'Kelly Madison',
|
||||||
url: 'https://www.pornfidelity.com',
|
url: 'https://www.kellymadison.com',
|
||||||
description: 'Home of Kelly Madison and Ryan Madison',
|
description: 'Home of Kelly Madison and Ryan Madison',
|
||||||
parent: 'kellymadison',
|
parent: 'kellymadison',
|
||||||
|
parameters: {
|
||||||
|
siteId: 1,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
slug: '5kporn',
|
slug: '5kporn',
|
||||||
|
@ -5305,6 +5312,10 @@ const sites = [
|
||||||
url: 'https://www.5kporn.com',
|
url: 'https://www.5kporn.com',
|
||||||
tags: ['5k'],
|
tags: ['5k'],
|
||||||
parent: 'kellymadison',
|
parent: 'kellymadison',
|
||||||
|
parameters: {
|
||||||
|
// IDs overlap with Fidelity sites
|
||||||
|
siteId: 1,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
slug: '5kteens',
|
slug: '5kteens',
|
||||||
|
@ -5312,6 +5323,9 @@ const sites = [
|
||||||
url: 'https://www.5kteens.com',
|
url: 'https://www.5kteens.com',
|
||||||
tags: ['5k'],
|
tags: ['5k'],
|
||||||
parent: 'kellymadison',
|
parent: 'kellymadison',
|
||||||
|
parameters: {
|
||||||
|
siteId: 2,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
// KILLERGRAM
|
// KILLERGRAM
|
||||||
{
|
{
|
||||||
|
|
|
@ -614,7 +614,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
|
||||||
const profiles = Promise.map(validSources, async (source) => {
|
const profiles = Promise.map(validSources, async (source) => {
|
||||||
try {
|
try {
|
||||||
// config may group sources to try until success
|
// config may group sources to try until success
|
||||||
return [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
|
return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
|
||||||
try {
|
try {
|
||||||
const entity = entitiesBySlug[scraperSlug] || null;
|
const entity = entitiesBySlug[scraperSlug] || null;
|
||||||
|
|
||||||
|
|
|
@ -108,6 +108,8 @@ async function startMemorySample(snapshotTriggers = []) {
|
||||||
|
|
||||||
async function init() {
|
async function init() {
|
||||||
try {
|
try {
|
||||||
|
await redis.connect();
|
||||||
|
|
||||||
if (argv.server) {
|
if (argv.server) {
|
||||||
await initServer();
|
await initServer();
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -174,6 +174,12 @@ const { argv } = yargs
|
||||||
default: config.upcomingMissingDateLimit,
|
default: config.upcomingMissingDateLimit,
|
||||||
alias: ['upcoming-null-date-limit'],
|
alias: ['upcoming-null-date-limit'],
|
||||||
})
|
})
|
||||||
|
.option('filter-sparse-dates', {
|
||||||
|
describe: 'If some but not all scenes have dates, filter out scenes without dates, instead of using missing date limit.',
|
||||||
|
type: 'boolean',
|
||||||
|
default: false,
|
||||||
|
alias: ['sparse'],
|
||||||
|
})
|
||||||
.option('page', {
|
.option('page', {
|
||||||
describe: 'Page to start scraping at',
|
describe: 'Page to start scraping at',
|
||||||
type: 'number',
|
type: 'number',
|
||||||
|
|
|
@ -10,6 +10,6 @@ const redisClient = redis.createClient({
|
||||||
socket: config.redis,
|
socket: config.redis,
|
||||||
});
|
});
|
||||||
|
|
||||||
redisClient.connect();
|
// redisClient.connect();
|
||||||
|
|
||||||
module.exports = redisClient;
|
module.exports = redisClient;
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
|
const unprint = require('unprint');
|
||||||
|
|
||||||
const slugify = require('../utils/slugify');
|
const slugify = require('../utils/slugify');
|
||||||
const qu = require('../utils/qu');
|
const qu = require('../utils/qu');
|
||||||
const http = require('../utils/http');
|
const http = require('../utils/http');
|
||||||
const { feetInchesToCm } = require('../utils/convert');
|
const { feetInchesToCm, femaleFeetUsToEu } = require('../utils/convert');
|
||||||
|
|
||||||
const siteMapByKey = {
|
const siteMapByKey = {
|
||||||
PF: 'pornfidelity',
|
PF: 'pornfidelity',
|
||||||
|
@ -16,14 +18,11 @@ const siteMapByKey = {
|
||||||
const siteMapBySlug = Object.entries(siteMapByKey).reduce((acc, [key, value]) => ({ ...acc, [value]: key }), {});
|
const siteMapBySlug = Object.entries(siteMapByKey).reduce((acc, [key, value]) => ({ ...acc, [value]: key }), {});
|
||||||
|
|
||||||
function scrapeLatest(scenes, site) {
|
function scrapeLatest(scenes, site) {
|
||||||
return scenes.reduce((acc, { query }) => {
|
return scenes.map(({ query }) => {
|
||||||
const release = {};
|
const release = {};
|
||||||
|
|
||||||
release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true);
|
release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true);
|
||||||
|
|
||||||
const siteId = release.shootId.match(/\d?\w{2}/)[0];
|
|
||||||
const siteSlug = siteMapByKey[siteId];
|
|
||||||
|
|
||||||
const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a'));
|
const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a'));
|
||||||
[release.entryId] = pathname.match(/\d+$/);
|
[release.entryId] = pathname.match(/\d+$/);
|
||||||
release.url = `${site.url}${pathname}`;
|
release.url = `${site.url}${pathname}`;
|
||||||
|
@ -47,15 +46,19 @@ function scrapeLatest(scenes, site) {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* using site ID, filter no longer needed
|
||||||
|
const siteId = release.shootId.match(/\d?\w{2}/)[0];
|
||||||
|
const siteSlug = siteMapByKey[siteId];
|
||||||
|
|
||||||
if (site.slug !== siteSlug) {
|
if (site.slug !== siteSlug) {
|
||||||
// using generic network overview, scene is not from the site we want
|
// using generic network overview, scene is not from the site we want
|
||||||
return { ...acc, unextracted: [...acc.unextracted, release] };
|
return { ...acc, unextracted: [...acc.unextracted, release] };
|
||||||
}
|
}
|
||||||
|
|
||||||
return { ...acc, scenes: [...acc.scenes, release] };
|
return { ...acc, scenes: [...acc.scenes, release] };
|
||||||
}, {
|
*/
|
||||||
scenes: [],
|
|
||||||
unextracted: [],
|
return release;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -114,34 +117,47 @@ async function scrapeScene({ query, html }, url, baseRelease, channel, session)
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(release);
|
|
||||||
|
|
||||||
return release;
|
return release;
|
||||||
}
|
}
|
||||||
|
|
||||||
function scrapeProfile({ query }) {
|
function scrapeProfile({ query }) {
|
||||||
const profile = {};
|
const profile = {};
|
||||||
|
|
||||||
const bioKeys = query.all('table.table td:nth-child(1)', true);
|
const bioKeys = query.contents('table.table td:nth-child(1), table.table th');
|
||||||
const bioValues = query.all('table.table td:nth-child(2)', true);
|
const bioValues = query.contents('table.table td:nth-child(2)');
|
||||||
const bio = bioKeys.reduce((acc, key, index) => ({ ...acc, [key.slice(0, -1)]: bioValues[index] }), {});
|
|
||||||
|
|
||||||
if (bio.Ethnicity) profile.ethnicity = bio.Ethnicity;
|
const bio = bioKeys.reduce((acc, key, index) => ({
|
||||||
if (bio.Measurements) [profile.bust, profile.waist, profile.hip] = bio.Measurements.split('-');
|
...acc,
|
||||||
if (bio.Birthplace) profile.birthPlace = bio.Birthplace;
|
[slugify(key, '_')]: bioValues[index],
|
||||||
|
}), {});
|
||||||
|
|
||||||
if (bio.Height) {
|
if (bio.ethnicity) profile.ethnicity = bio.ethnicity;
|
||||||
const [feet, inches] = bio.Height.match(/\d+/g);
|
if (bio.measurements) profile.measurements = bio.measurements;
|
||||||
|
if (bio.birthplace) profile.birthPlace = bio.birthplace;
|
||||||
|
if (bio.shoe_size) profile.foot = femaleFeetUsToEu(bio.shoe_size);
|
||||||
|
|
||||||
|
if (bio.height) {
|
||||||
|
const [feet, inches] = bio.height.match(/\d+/g);
|
||||||
profile.height = feetInchesToCm(feet, inches);
|
profile.height = feetInchesToCm(feet, inches);
|
||||||
}
|
}
|
||||||
|
|
||||||
profile.avatar = query.img('img[src*="model"]');
|
if (bio.birthday) {
|
||||||
|
const [month, day] = bio.birthday.split('/');
|
||||||
|
const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day)));
|
||||||
|
|
||||||
|
birthday.setUTCFullYear(0); // indicate birth year is unknown
|
||||||
|
|
||||||
|
profile.dateOfBirth = new Date(birthday);
|
||||||
|
}
|
||||||
|
|
||||||
|
profile.avatar = query.img('img[src*="model"][src*="headshot"]');
|
||||||
|
profile.photos = query.imgs('img[src*="model"][src*="thumb_image"], img[src*="model"][src*="bg_image"]');
|
||||||
|
|
||||||
return profile;
|
return profile;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchLatest(channel, page = 1) {
|
async function fetchLatest(channel, page = 1) {
|
||||||
const url = `${channel.url}/episodes/search?page=${page}`; // TLS issues with teenfidelity.com, same overview on all sites
|
const url = `${channel.url}/episodes/search?page=${page}&site=${channel.parameters.siteId || ''}`; // TLS issues with teenfidelity.com, same overview on all sites
|
||||||
const res = await http.get(url, {
|
const res = await http.get(url, {
|
||||||
headers: {
|
headers: {
|
||||||
'X-Requested-With': 'XMLHttpRequest',
|
'X-Requested-With': 'XMLHttpRequest',
|
||||||
|
@ -165,16 +181,17 @@ async function fetchScene(url, channel, baseRelease) {
|
||||||
return res.ok ? scrapeScene(res.item, url, baseRelease, channel, session) : res.status;
|
return res.ok ? scrapeScene(res.item, url, baseRelease, channel, session) : res.status;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchProfile({ name: actorName }) {
|
async function fetchProfile({ name: actorName }, { entity }) {
|
||||||
const actorSlug = slugify(actorName);
|
const actorSlug = slugify(actorName);
|
||||||
const res = await qu.get(`https://www.kellymadison.com/models/${actorSlug}`, null, {
|
|
||||||
|
const res = await unprint.get(`${entity.url}/models/${actorSlug}`, {
|
||||||
headers: {
|
headers: {
|
||||||
'X-Requested-With': 'XMLHttpRequest',
|
'X-Requested-With': 'XMLHttpRequest',
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
if (res.ok) {
|
if (res.ok) {
|
||||||
return scrapeProfile(res.item);
|
return scrapeProfile(res.context);
|
||||||
}
|
}
|
||||||
|
|
||||||
return res.status;
|
return res.status;
|
||||||
|
|
|
@ -265,6 +265,7 @@ const scrapers = {
|
||||||
julesjordan,
|
julesjordan,
|
||||||
karups,
|
karups,
|
||||||
kellymadison,
|
kellymadison,
|
||||||
|
'5kporn': kellymadison,
|
||||||
killergram,
|
killergram,
|
||||||
kink,
|
kink,
|
||||||
kinkmen: kink,
|
kinkmen: kink,
|
||||||
|
|
|
@ -147,10 +147,12 @@ async function scrapeReleases(scraper, entity, preData, isUpcoming) {
|
||||||
|
|
||||||
const releases = await scrapeReleasesPage(argv.page || 1, []);
|
const releases = await scrapeReleasesPage(argv.page || 1, []);
|
||||||
|
|
||||||
const hasDates = releases.every((release) => !!release.date);
|
const hasDates = argv.filterSparseDates
|
||||||
|
? releases.some((release) => !!release.date)
|
||||||
|
: releases.every((release) => !!release.date);
|
||||||
|
|
||||||
const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0)))
|
const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0)))
|
||||||
|| (hasDates && releases.filter((release) => moment(release.date).isAfter(argv.after)))
|
|| (hasDates && releases.filter((release) => release.date && moment(release.date).isAfter(argv.after)))
|
||||||
|| releases.slice(0, Math.max(isUpcoming ? argv.upcomingMissingDateLimit : argv.missingDateLimit, 0));
|
|| releases.slice(0, Math.max(isUpcoming ? argv.upcomingMissingDateLimit : argv.missingDateLimit, 0));
|
||||||
|
|
||||||
const { uniqueReleases, duplicateReleases } = argv.force
|
const { uniqueReleases, duplicateReleases } = argv.force
|
||||||
|
|
Loading…
Reference in New Issue