Added support for Family Strokes.

2020-01-13 23:45:09 +01:00
parent 48b37a509e
commit 859cb7e1f3
58 changed files with 2130 additions and 33 deletions

View File

@@ -3,6 +3,7 @@
 const util = require('util');
 const winston = require('winston');
 const args = require('./argv');
+require('winston-daily-rotate-file');
 
 const logger = winston.createLogger({
   format: winston.format.combine(
@@ -19,6 +20,11 @@ const logger = winston.createLogger({
       ),
       timestamp: true,
     }),
+    new winston.transports.DailyRotateFile({
+      datePattern: 'YYYY-MM-DD',
+      filename: 'log/%DATE%.log',
+      level: 'silly',
+    }),
   ],
 });
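Note that winston-daily-rotate-file only needs to be required once; as a side effect it registers its transport on winston.transports. A minimal standalone sketch of what the new transport does (log message and file name are illustrative):

// sketch: writes one file per day, e.g. log/2020-01-13.log
const winston = require('winston');
require('winston-daily-rotate-file'); // side effect: registers winston.transports.DailyRotateFile

const logger = winston.createLogger({
  transports: [
    new winston.transports.DailyRotateFile({
      datePattern: 'YYYY-MM-DD',  // one file per calendar day
      filename: 'log/%DATE%.log', // %DATE% is filled in from datePattern
      level: 'silly',             // the file transport records every level
    }),
  ],
});

logger.silly('written to the current day\'s file');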

View File

@@ -320,7 +320,6 @@ async function storeReleaseAssets(releases) {
     await createMediaDirectory('releases', subpath);
 
-    try {
       // don't use Promise.all to prevent concurrency issues with duplicate detection
       if (release.poster) {
         await storePhotos([release.poster], {
@@ -346,9 +345,6 @@ async function storeReleaseAssets(releases) {
         targetId: release.id,
         subpath,
       }, identifier);
-    } catch (error) {
-      console.log(release.url, error);
-    }
   }, {
     concurrency: 10,
   });
@@ -409,7 +405,7 @@ async function storeReleases(releases) {
       ...releaseWithChannelSite,
     };
   } catch (error) {
-    logger.error(error);
+    logger.error(error.message);
 
     return null;
   }
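Passing error.message instead of the Error object keeps the log line readable: depending on the configured winston formats, a raw Error can serialize without its message. A quick illustration (the behavior noted in the comments is typical, not verified against this exact logger config):

const err = new Error('failed to store release assets');
logger.error(err);         // may serialize poorly unless something like winston.format.errors() is configured
logger.error(err.message); // always logs the plain message (the stack trace is dropped)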

View File

@@ -79,7 +79,7 @@ async function deepFetchReleases(baseReleases) {
         deep: true,
       };
     } catch (error) {
-      logger.error(error);
+      logger.error(error.message);
 
       return {
         ...release,

View File

@@ -6,15 +6,6 @@ const { JSDOM } = require('jsdom');
 const cheerio = require('cheerio');
 const moment = require('moment');
-const { matchTags } = require('../tags');
-
-const defaultTags = {
-  swallowed: ['blowjob', 'deepthroat', 'facefuck'],
-  trueanal: ['anal'],
-  allanal: ['anal', 'fmf'],
-  nympho: [],
-};
-
 const descriptionTags = {
   'anal cream pie': 'anal creampie',
   'ass to mouth': 'ass to mouth',
@@ -55,7 +46,7 @@ async function scrapeLatestA(html, site) {
   const actors = Array.from(element.querySelectorAll('h4.models a'), actorElement => actorElement.textContent);
 
   const durationString = element.querySelector('.total-time').textContent.trim();
-  // timestamp is somethines 00:00, sometimes 0:00:00
+  // timestamp is sometimes 00:00, sometimes 0:00:00
   const duration = durationString.split(':').length === 3
     ? moment.duration(durationString).asSeconds()
     : moment.duration(`00:${durationString}`).asSeconds();
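The 00: prefix matters because moment.duration reads a two-part timestamp as hours:minutes, not minutes:seconds. A standalone sketch of the same normalization:

const moment = require('moment');

function parseDuration(durationString) {
  // '0:19:05' already has an hours part; '19:05' would be read as 19h 5m,
  // so pad it to '00:19:05' (0h 19m 5s) first
  return durationString.split(':').length === 3
    ? moment.duration(durationString).asSeconds()
    : moment.duration(`00:${durationString}`).asSeconds();
}

parseDuration('19:05');   // 1145
parseDuration('0:19:05'); // 1145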
@@ -70,7 +61,7 @@ async function scrapeLatestA(html, site) {
     .map(photoUrl => photoUrl.slice(photoUrl.indexOf('http'), photoUrl.indexOf('.jpg') + 4));
 
   const photos = [...primaryPhotos, ...secondaryPhotos];
-  const tags = await matchTags([...defaultTags[site.slug], ...deriveTagsFromDescription(description)]);
+  const tags = deriveTagsFromDescription(description);
 
   const scene = {
     url,
@@ -124,7 +115,7 @@ async function scrapeLatestB(html, site) {
     .map(photoUrl => photoUrl.slice(photoUrl.indexOf('http'), photoUrl.indexOf('.jpg') + 4));
 
   const photos = [...primaryPhotos, ...secondaryPhotos];
-  const tags = await matchTags([...defaultTags[site.slug], ...deriveTagsFromDescription(description)]);
+  const tags = deriveTagsFromDescription(description);
 
   return {
     url,
@@ -155,7 +146,7 @@ async function scrapeSceneA(html, url, site) {
   const actors = Array.from(element.querySelectorAll('.models a'), actorElement => actorElement.textContent);
 
   const durationString = element.querySelector('.total-time').textContent.trim();
-  // timestamp is somethines 00:00, sometimes 0:00:00
+  // timestamp is sometimes 00:00, sometimes 0:00:00
   const duration = durationString.split(':').length === 3
     ? moment.duration(durationString).asSeconds()
     : moment.duration(`00:${durationString}`).asSeconds();
@@ -163,7 +154,7 @@ async function scrapeSceneA(html, url, site) {
   const { poster } = document.querySelector('.content-page-header video');
   const { src, type } = document.querySelector('.content-page-header source');
-  const tags = await matchTags([...defaultTags[site.slug], ...deriveTagsFromDescription(description)]);
+  const tags = deriveTagsFromDescription(description);
 
   return {
     url,
@@ -204,7 +195,7 @@ async function scrapeSceneB(html, url, site) {
   const { poster } = document.querySelector('.content-page-header-inner video');
   const { src, type } = document.querySelector('.content-page-header-inner source');
-  const tags = await matchTags([...defaultTags[site.slug], ...deriveTagsFromDescription(description)]);
+  const tags = deriveTagsFromDescription(description);
 
   const scene = {
     url,

View File

@@ -30,7 +30,7 @@ async function scrapeProfile(html, _url, actorName) {
   if (descriptionString) profile.description = descriptionString.textContent;
 
-  if (bio.Birthday) profile.birthdate = moment.utc(bio.Birthday, 'MMM D, YYYY').toDate();
+  if (bio.Birthday && !/-0001/.test(bio.Birthday)) profile.birthdate = moment.utc(bio.Birthday, 'MMM D, YYYY').toDate(); // birthyear sometimes -0001, see Spencer Bradley as of january 2020
   if (bio.Born) profile.birthdate = moment.utc(bio.Born, 'YYYY-MM-DD').toDate();
 
   profile.birthPlace = bio['Birth Place'] || bio.Birthplace;
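The guard exists because the source sometimes reports an unknown birth year as -0001; without it, a bogus date would be stored. A standalone sketch (the function name is illustrative):

const moment = require('moment');

function parseBirthday(birthday) {
  // an unknown birth year comes through as e.g. 'May 29, -0001'; skip it
  if (!birthday || /-0001/.test(birthday)) return null;

  return moment.utc(birthday, 'MMM D, YYYY').toDate();
}

parseBirthday('May 29, 1992');  // 1992-05-29T00:00:00.000Z
parseBirthday('May 29, -0001'); // null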

View File

@@ -15,7 +15,7 @@ function extractTitle(pathname) {
 function extractActors(str) {
   return str
-    .split(/,|\band/)
+    .split(/,|\band\b/ig)
    .filter(actor => !/\.{3}/.test(actor))
    .map(actor => actor.trim())
    .filter(actor => actor.length > 0);
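The regex change does two things: the i flag catches a capitalized 'And' between names, and the closing \b stops the now case-insensitive pattern from splitting inside names that merely start with 'And'. (The g flag is redundant here, since split already splits on every match.) A sketch with illustrative inputs:

const extractActors = str => str
  .split(/,|\band\b/ig)
  .filter(actor => !/\.{3}/.test(actor))
  .map(actor => actor.trim())
  .filter(actor => actor.length > 0);

extractActors('Jane Doe And John Roe'); // ['Jane Doe', 'John Roe'] — i flag matches 'And'
extractActors('Andy Roe, Jane Doe');    // ['Andy Roe', 'Jane Doe'] — \b keeps 'Andy' intact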
@@ -81,7 +81,54 @@ function scrapeScene(html, site) {
   return release;
 }
 
-async function fetchLatest(site, page = 1) {
+function scrapeSceneA(html, site, sceneX, url) {
+  const scene = sceneX || new JSDOM(html).window.document;
+  const release = { site };
+
+  release.description = scene.querySelector('.scene-story').textContent.replace('...read more', '...').trim();
+  release.date = moment.utc(scene.querySelector('.scene-date').textContent, 'MM/DD/YYYY').toDate();
+  release.actors = Array.from(scene.querySelectorAll('.starring span'), el => extractActors(el.textContent)).flat();
+
+  const durationString = scene.querySelector('.time').textContent.trim();
+  const duration = ['00'].concat(durationString.split(':')).slice(-3).join(':'); // ensure hh:mm:ss
+  release.duration = moment.duration(duration).asSeconds();
+
+  if (sceneX) {
+    const titleEl = scene.querySelector(':scope > a');
+
+    release.url = titleEl.href;
+    release.entryId = titleEl.id;
+    release.title = titleEl.title;
+
+    const [poster, ...photos] = Array.from(scene.querySelectorAll('.scene img'), el => el.src);
+    release.poster = [poster.replace('bio_big', 'video'), poster];
+    release.photos = photos;
+  }
+
+  if (!sceneX) {
+    release.title = scene.querySelector('.title span').textContent;
+    release.url = url;
+
+    release.poster = scene.querySelector('video').poster;
+    release.photos = [release.poster.replace('video', 'bio_small'), release.poster.replace('video', 'bio_small2')];
+  }
+
+  const [, entryIdA, entryIdB] = new URL(release.url).pathname.split('/');
+  release.entryId = entryIdA === 'scenes' ? entryIdB : entryIdA;
+
+  return release;
+}
+
+function scrapeLatestA(html, site) {
+  const { document } = new JSDOM(html).window;
+  const scenes = Array.from(document.querySelectorAll('.scenewrapper'));
+
+  return scenes.map(scene => scrapeSceneA(null, site, scene));
+}
+
+async function fetchLatestTeamSkeet(site, page = 1) {
   const url = `https://www.teamskeet.com/t1/updates/load?fltrs[site]=${site.parameters.id}&page=${page}&view=newest&fltrs[time]=ALL&order=DESC`;
   const res = await bhttp.get(url);
@@ -92,10 +139,37 @@ async function fetchLatest(site, page = 1) {
   return null;
 }
 
+async function fetchLatestA(site) {
+  const url = `${site.url}/scenes`;
+  const res = await bhttp.get(url);
+
+  if (res.statusCode === 200) {
+    return scrapeLatestA(res.body.toString(), site);
+  }
+
+  return null;
+}
+
+async function fetchLatest(site, page = 1) {
+  if (site.parameters.id) {
+    return fetchLatestTeamSkeet(site, page);
+  }
+
+  if (site.parameters.scraper === 'A') {
+    return fetchLatestA(site, page);
+  }
+
+  return null;
+}
+
 async function fetchScene(url, site) {
   const session = bhttp.session(); // resolve redirects
   const res = await session.get(url);
 
+  if (site.parameters.scraper === 'A') {
+    return scrapeSceneA(res.body.toString(), site, null, url);
+  }
+
   return scrapeScene(res.body.toString(), site);
 }
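Two details worth noting here. The slice(-3) idiom normalizes durations to h:mm:ss regardless of whether the source includes an hours part, and fetchLatest now dispatches on site.parameters instead of assuming every site uses the TeamSkeet update feed. Sketches (the parameter values are hypothetical):

const pad = s => ['00'].concat(s.split(':')).slice(-3).join(':');
pad('19:05');   // '00:19:05' — hours padded in
pad('1:19:05'); // '1:19:05' — already three parts, the padding is dropped by slice(-3)

// dispatch: parameters.id → TeamSkeet JSON feed, parameters.scraper === 'A' → new-style markup
// await fetchLatest({ parameters: { id: '42' } });                                 // → fetchLatestTeamSkeet
// await fetchLatest({ url: 'https://example.com', parameters: { scraper: 'A' } }); // → fetchLatestA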

View File

@@ -8,6 +8,11 @@ const knex = require('./knex');
 const whereOr = require('./utils/where-or');
 
 async function curateSite(site, includeParameters = false) {
+  const tags = await knex('sites_tags')
+    .select('tags.*', 'sites_tags.inherit')
+    .where('site_id', site.id)
+    .join('tags', 'tags.id', 'sites_tags.tag_id');
+
   const parameters = JSON.parse(site.parameters);
 
   return {
@@ -16,6 +21,7 @@ async function curateSite(site, includeParameters = false) {
     url: site.url,
     description: site.description,
     slug: site.slug,
+    tags,
     independent: !!parameters && parameters.independent,
     parameters: includeParameters ? parameters : null,
     network: {
@@ -55,7 +61,7 @@ function destructConfigNetworks(networks) {
 async function findSiteByUrl(url) {
   const { hostname } = new URL(url);
-  const domain = hostname.replace(/^www./, '');
+  const domain = hostname.replace(/www.|tour./, '');
 
   const site = await knex('sites')
     .leftJoin('networks', 'sites.network_id', 'networks.id')
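One caveat on the new pattern: the dots are unescaped (they match any character) and the alternation is unanchored, so it can strip characters from the middle of a hostname. A quick illustration, with an anchored alternative as a possible tightening (not part of this commit):

'www.example.com'.replace(/www.|tour./, '');     // 'example.com' — intended
'tour.example.com'.replace(/www.|tour./, '');    // 'example.com' — intended
'detour.example.com'.replace(/www.|tour./, ''); // 'deexample.com' — unintended mid-hostname match
'tour.example.com'.replace(/^(www|tour)\./, ''); // anchored and escaped alternative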

View File

@@ -42,6 +42,7 @@ async function matchTags(rawTags) {
   const tagEntries = await knex('tags')
     .pluck('aliases.id')
     .whereIn('tags.name', tags)
+    .orWhereIn('tags.slug', tags)
     .where(function where() {
       this
         .whereNull('tags.alias_for')
@@ -58,15 +59,20 @@
 }
 
 async function associateTags(release, releaseId) {
-  if (!release.tags || release.tags.length === 0) {
+  const siteTags = release.site.tags.filter(tag => tag.inherit === true).map(tag => tag.id);
+
+  const rawReleaseTags = release.tags || [];
+  const releaseTags = rawReleaseTags.some(tag => typeof tag === 'string')
+    ? await matchTags(release.tags) // scraper returned raw tags
+    : rawReleaseTags; // tags already matched by (outdated) scraper
+
+  const tags = releaseTags.concat(siteTags);
+
+  if (tags.length === 0) {
     logger.info(`No tags available for (${release.site.name}, ${releaseId}) "${release.title}"`);
     return;
   }
 
-  const tags = release.tags.some(tag => typeof tag === 'string')
-    ? await matchTags(release.tags) // scraper returned raw tags
-    : release.tags; // tags already matched by (outdated) scraper
-
   const associationEntries = await knex('releases_tags')
     .where('release_id', releaseId)
     .whereIn('tag_id', tags);
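Taken together with the sites_tags join in curateSite, this lets tags marked inherit apply to every release of a site on top of whatever the scraper found, while matchTags now also resolves slugs via the added orWhereIn. A sketch of the merge, with hypothetical tag ids:

// site tags come from curateSite's sites_tags join
const release = {
  site: { name: 'Example Site', tags: [{ id: 12, inherit: true }, { id: 34, inherit: false }] },
  tags: ['anal', 'creampie'], // raw strings from a scraper → resolved via matchTags
};

// inside associateTags (ids hypothetical):
// matchTags(['anal', 'creampie']) → [3, 7]
// releaseTags.concat(siteTags)    → [3, 7, 12] — tag 34 is not inherited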