forked from DebaucheryLibrarian/traxxx
Added Dogfart scraper. Added 'date added' property to release page.
@@ -30,6 +30,11 @@ const { argv } = yargs
     type: 'string',
     default: config.fetchAfter.join(' '),
   })
+  .option('pages', {
+    describe: 'Limit pages to scrape per site. Only used when no dates are found or --after is unset.',
+    type: 'number',
+    default: 1,
+  })
   .option('save', {
     describe: 'Save fetched releases to database',
     type: 'boolean',
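The new --pages option caps pagination for listings that carry no dates. A hypothetical invocation tying it to the existing --after and --save flags (the entry-point path is an assumption, it is not shown in this diff):

    node src/app.js --after "1 month" --pages 3 --save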
@@ -44,6 +44,7 @@ function curateSites(sites) {
       id: site.network_id,
       name: site.network_name,
       slug: site.network_slug,
+      parameters: JSON.parse(site.network_parameters),
     },
     parameters: JSON.parse(site.parameters),
   }));
@@ -55,7 +56,7 @@ async function accumulateIncludedSites() {
   const networkIds = networks.map(network => network.id);
 
   const rawSites = await knex('sites')
-    .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug')
+    .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.parameters as network_parameters')
     .whereIn('sites.slug', argv.sites || [])
     .orWhereIn('network_id', networkIds)
     .leftJoin('networks', 'sites.network_id', 'networks.id');
@@ -269,7 +270,7 @@ async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page
 
   const oldestReleaseOnPage = latestReleases.slice(-1)[0].date;
 
-  if (uniqueReleases.length > 0 && moment(oldestReleaseOnPage).isAfter(afterDate)) {
+  if ((uniqueReleases.length > 0 && moment(oldestReleaseOnPage).isAfter(afterDate)) || (!oldestReleaseOnPage && page < argv.pages)) {
     return fetchNewReleases(scraper, site, afterDate, accReleases.concat(uniqueReleases), page + 1);
   }
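Spelled out, the recursion is meant to continue in exactly two cases: the page's oldest dated release is still newer than the cutoff, or the listing carries no dates at all and the --pages cap has not been reached. A minimal sketch of that intent, using the names from the hunk above:

    // listings without dates fall back to the --pages counter
    const hasDates = Boolean(oldestReleaseOnPage);
    const newReleasesInRange = uniqueReleases.length > 0
        && moment(oldestReleaseOnPage).isAfter(afterDate);

    if (newReleasesInRange || (!hasDates && page < argv.pages)) {
      return fetchNewReleases(scraper, site, afterDate, accReleases.concat(uniqueReleases), page + 1);
    }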
@@ -12,7 +12,7 @@ async function findSite(url) {
   const domain = hostname.replace(/^www./, '');
 
   const site = await knex('sites')
-    .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug')
+    .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.parameters as network_parameters')
     .where('sites.url', 'like', `%${domain}`)
     .leftJoin('networks', 'sites.network_id', 'networks.id')
     .first()
@@ -30,6 +30,7 @@ async function findSite(url) {
     network: {
       id: site.network_id || site.id,
       slug: site.network_slug || site.slug,
+      parameters: site.network_parameters && JSON.parse(site.network_parameters),
     },
     parameters: site.parameters && JSON.parse(site.parameters),
     isFallback: site.network_id === undefined,
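The && guards make the JSON.parse calls safe for rows without parameters: null happens to be tolerated by JSON.parse (it coerces its argument to a string), but undefined throws. A quick illustration:

    JSON.parse(null);      // null: coerced to the string 'null'
    JSON.parse(undefined); // SyntaxError: coerced to the string 'undefined'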
@@ -94,8 +95,8 @@ async function storeRelease(release) {
   return release;
 }
 
-async function fetchScene(url) {
-  const site = await findSite(url);
+async function fetchScene(url, release) {
+  const site = release.site || await findSite(url);
   const scraper = scrapers[site.slug] || scrapers[site.network.slug];
 
   if (!scraper) {
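Threading release through fetchScene lets a caller that already resolved the site skip the findSite lookup. A hedged usage sketch (the caller shape is assumed); note that release.site is now read unconditionally, so even URL-only callers must pass at least an empty object:

    await fetchScene(release.url, release); // site already known, no findSite round-trip
    await fetchScene(url, {});              // bare URL, findSite resolves the site by hostname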
@@ -21,6 +21,7 @@ async function curateRelease(release) {
     id: release.id,
     title: release.title,
     date: release.date,
+    dateAdded: release.created_at,
     description: release.description,
     url: release.url,
     shootId: release.shoot_id,
@@ -73,7 +74,7 @@ async function fetchReleases(releaseId) {
     .leftJoin('sites', 'releases.site_id', 'sites.id')
     .leftJoin('studios', 'releases.studio_id', 'studios.id')
     .leftJoin('networks', 'sites.network_id', 'networks.id')
-    .orderBy('date', 'desc')
+    .orderBy([{ column: 'date', order: 'desc' }, { column: 'created_at', order: 'desc' }])
     .limit(100);
 
   return curateReleases(releases);
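The array form is standard Knex and is equivalent to chaining two orderBy calls; it makes created_at the tie-breaker so same-day releases surface in insertion order:

    // equivalent chained form
    .orderBy('date', 'desc')
    .orderBy('created_at', 'desc')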
@@ -91,7 +91,7 @@ async function scrapeScene(html, url, site) {
   const [channelSite, tags] = await Promise.all([
     site.isFallback
       ? knex('sites')
-        .where({ id: siteId })
+        .where({ slug: siteId })
         .orWhereRaw('name = ? collate NOCASE', [siteName])
         .first()
       : site,
@@ -69,10 +69,12 @@ async function scrapeScene(html, url, site) {
   const rawTags = $('.vdoTags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
 
   const [channelSite, tags] = await Promise.all([
-    knex('sites')
-      .where({ slug: siteId })
-      .orWhere({ name: siteName })
-      .first(),
+    site.isFallback
+      ? knex('sites')
+        .where({ slug: siteId })
+        .orWhere({ name: siteName })
+        .first()
+      : site,
     matchTags(rawTags),
   ]);
 
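This isFallback pattern recurs throughout the commit: when findSite only matched a network, the scraper resolves the concrete channel site from page metadata; when the caller already knew the site, it is reused as-is. Condensed to its skeleton (identifiers from the hunk above):

    const channelSite = site.isFallback
      ? await knex('sites').where({ slug: siteId }).first() // resolve from page metadata
      : site;                                               // trust the caller's site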
170 src/scrapers/dogfart.js Normal file
@@ -0,0 +1,170 @@
+'use strict';
+
+/* eslint-disable newline-per-chained-call */
+const Promise = require('bluebird');
+const bhttp = require('bhttp');
+const { JSDOM } = require('jsdom');
+const moment = require('moment');
+
+const knex = require('../knex');
+const { matchTags } = require('../tags');
+
+async function getPhoto(url) {
+  const res = await bhttp.get(url);
+  const html = res.body.toString();
+  const { document } = new JSDOM(html).window;
+
+  const photoUrl = document.querySelector('.scenes-module img').src;
+
+  return photoUrl;
+}
+
+async function getPhotos(albumUrl, site, siteUrl) {
+  const res = await bhttp.get(albumUrl);
+  const html = res.body.toString();
+  const { document } = new JSDOM(html).window;
+
+  const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
+  const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10);
+
+  // dogfart has massive albums; pick 25 or the configured number of photos: first, last and evenly in between
+  const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
+  const photoIndexes = [1]
+    .concat(Array.from({ length: photoLimit - 2 }, (value, index) => Math.floor((index + 1) * (lastPhotoIndex / (photoLimit - 2)))))
+    .concat(lastPhotoIndex);
+
+  if (photoLimit > 25) {
+    console.log(`${site.name}: Scraping ${photoLimit} album photos from ${siteUrl}, this may take some time...`);
+  }
+
+  const photoUrls = await Promise.map(photoIndexes, async (index) => {
+    const pageUrl = `https://blacksonblondes.com${lastPhotoPage.replace(/\d+.jpg/, `${index.toString().padStart(3, '0')}.jpg`)}`;
+
+    return getPhoto(pageUrl);
+  }, {
+    concurrency: 5,
+  });
+
+  return photoUrls;
+}
+
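With the default limit the sampler picks the first photo, 23 evenly spaced interior indexes, and the last photo. A worked example with small numbers (photoLimit 5, lastPhotoIndex 100) shows the spacing, including the quirk that the interior formula also lands on the last index:

    const photoLimit = 5;
    const lastPhotoIndex = 100;
    const photoIndexes = [1]
      .concat(Array.from({ length: photoLimit - 2 }, (value, index) => Math.floor((index + 1) * (lastPhotoIndex / (photoLimit - 2)))))
      .concat(lastPhotoIndex);
    console.log(photoIndexes); // [ 1, 33, 66, 100, 100 ]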
+function scrapeLatest(html, site) {
+  const { document } = new JSDOM(html).window;
+  const sceneElements = Array.from(document.querySelectorAll('.recent-updates'));
+
+  return sceneElements.reduce((acc, element) => {
+    const siteUrl = element.querySelector('.help-block').textContent;
+
+    if (siteUrl.toLowerCase() !== new URL(site.url).host) {
+      // different dogfart site
+      return acc;
+    }
+
+    const sceneLinkElement = element.querySelector('.thumbnail');
+    const url = `https://dogfartnetwork.com${sceneLinkElement.href}`;
+    const { pathname } = new URL(url);
+    const entryId = `${site.slug}_${pathname.split('/')[4]}`;
+
+    const title = element.querySelector('.scene-title').textContent;
+    const actors = title.split(/[,&]|\band\b/).map(actor => actor.trim());
+
+    const poster = `https:${element.querySelector('img').src}`;
+    const trailer = sceneLinkElement.dataset.preview_clip_url;
+
+    return [
+      ...acc,
+      {
+        url,
+        entryId,
+        title,
+        actors,
+        poster,
+        trailer: {
+          src: trailer,
+        },
+        site,
+      },
+    ];
+  }, []);
+}
+
+async function scrapeScene(html, url, site) {
+  const { document } = new JSDOM(html).window;
+
+  const title = document.querySelector('.description-title').textContent;
+  const actors = Array.from(document.querySelectorAll('.more-scenes a')).map(({ textContent }) => textContent);
+
+  const metaDescription = document.querySelector('meta[itemprop="description"]');
+  const description = metaDescription
+    ? metaDescription.content
+    : document.querySelector('.description')
+      .textContent
+      .replace(/[ \t\n]{2,}/g, ' ')
+      .replace('...read more', '')
+      .trim();
+
+  const siteSlug = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
+  const date = new Date(document.querySelector('meta[itemprop="uploadDate"]').content);
+  const duration = moment
+    .duration(document
+      .querySelectorAll('.extra-info p')[1]
+      .textContent
+      .match(/\d+:\d+$/)[0])
+    .asSeconds();
+
+  const trailerElement = document.querySelector('.html5-video');
+  const poster = `https:${trailerElement.dataset.poster}`;
+  const { trailer } = trailerElement.dataset;
+
+  const lastPhotosUrl = Array.from(document.querySelectorAll('.pagination a')).slice(-1)[0].href;
+  const { origin, pathname } = new URL(url);
+  const photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, site, url);
+
+  const stars = Number(document.querySelector('span[itemprop="average"]').textContent) / 2;
+  const rawTags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);
+
+  const [channelSite, tags] = await Promise.all([
+    site.isFallback
+      ? knex('sites')
+        .where({ slug: siteSlug })
+        .orWhere({ url: `https://${siteSlug}.com` })
+        .first()
+      : site,
+    matchTags(rawTags),
+  ]);
+
+  return {
+    url,
+    title,
+    description,
+    actors,
+    date,
+    duration,
+    poster,
+    photos,
+    trailer: {
+      src: trailer,
+    },
+    tags,
+    rating: {
+      stars,
+    },
+    site: channelSite || site,
+  };
+}
+
+async function fetchLatest(site, page = 1) {
+  const res = await bhttp.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);
+
+  return scrapeLatest(res.body.toString(), site);
+}
+
+async function fetchScene(url, site) {
+  const res = await bhttp.get(url);
+
+  return scrapeScene(res.body.toString(), url, site);
+}
+
+module.exports = {
+  fetchLatest,
+  fetchScene,
+};
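A hedged smoke test for the new module; the site shape below is inferred from what scrapeLatest and getPhotos actually read (url, slug, name, network.parameters), not from a documented interface:

    const dogfart = require('./src/scrapers/dogfart');

    const site = {
      name: 'Blacks On Blondes',
      slug: 'blacksonblondes',
      url: 'https://blacksonblondes.com',
      network: { slug: 'dogfart', parameters: null },
    };

    // fetches page 1 of the network's scene listing and logs the titles
    dogfart.fetchLatest(site).then(releases => console.log(releases.map(release => release.title)));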
@@ -5,6 +5,7 @@ const bangbros = require('./bangbros');
 const blowpass = require('./blowpass');
 const brazzers = require('./brazzers');
 const ddfnetwork = require('./ddfnetwork');
+const dogfart = require('./dogfart');
 const evilangel = require('./evilangel');
 const julesjordan = require('./julesjordan');
 const kink = require('./kink');
@@ -23,6 +24,8 @@ module.exports = {
   blowpass,
   brazzers,
   ddfnetwork,
+  dogfart,
+  dogfartnetwork: dogfart,
   evilangel,
   julesjordan,
   kink,
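The module is registered under both keys because scraper lookup (see the fetchScene hunk earlier) tries the site slug first and then the network slug, and the Dogfart network presumably carries the slug dogfartnetwork:

    const scraper = scrapers[site.slug] || scrapers[site.network.slug];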
@@ -3,8 +3,8 @@
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');
-const knex = require('knex');
 
+const knex = require('../knex');
 const { matchTags } = require('../tags');
 
 function scrapeLatest(html, site) {
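The require swap replaces the bare knex factory, which would still need connection config on every use, with the project's preconfigured instance. A sketch of what such a shared module conventionally looks like (the actual contents of src/knex.js are not shown in this diff and are assumed here):

    // src/knex.js (assumed shape): export one configured instance
    const config = require('config');

    module.exports = require('knex')(config.database);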
@@ -75,7 +75,9 @@ async function scrapeScene(html, url, shootId, ratingRes, site) {
   const rawTags = $('.tag-list > a[href*="/tag"]').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
 
   const [channelSite, tags] = await Promise.all([
-    knex('sites').where({ slug: sitename }).first(),
+    site.isFallback
+      ? knex('sites').where({ slug: sitename }).first()
+      : site,
     matchTags(rawTags),
   ]);
 
@@ -58,7 +58,7 @@ async function scrapeScene(html, url, site) {
 
   const [channelSite, tags] = await Promise.all([
     knex('sites')
-      .where({ id: siteId })
+      .where({ slug: siteId })
      .orWhere({ url: `https://www.mofos.com${siteUrl}` })
      .orWhere({ name: sitename })
      .first(),
@@ -70,7 +70,7 @@ async function scrapeScene(html, url, site) {
 
   const [channelSite, tags] = await Promise.all([
     knex('sites')
-      .where({ id: siteId })
+      .where({ slug: siteId })
      .orWhere({ name: siteName })
      .first(),
    matchTags(rawTags),
@@ -13,6 +13,8 @@ function scrapeLatest(html, site) {
   const $ = cheerio.load(html, { normalizeWhitespace: true });
   const sceneElements = $('.card.card--release').toArray();
 
+  console.log(sceneElements);
+
   return sceneElements.map((element) => {
     const sceneLinkElement = $(element).find('.card-info__title a');
     const title = sceneLinkElement.attr('title');
@@ -22,6 +24,8 @@ function scrapeLatest(html, site) {
     const date = moment.utc($(element).find('.card-info__meta-date').text(), 'MMMM DD, YYYY').toDate();
     const actors = $(element).find('.card-info__cast a').map((actorIndex, actorElement) => $(actorElement).text().trim()).toArray();
 
+    console.log(date, actors, title);
+
     return {
       url,
       entryId,
@@ -54,6 +58,8 @@ async function scrapeScene(data, url, site) {
   const { likes, dislikes } = data.stats;
   const duration = data.videos.mediabook.length;
 
+  console.log(data);
+
   const rawTags = data.tags.map(tag => tag.name);
   const tags = await matchTags(rawTags);
 
@@ -3,9 +3,9 @@
 const Promise = require('bluebird');
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
-const knex = require('knex');
 const moment = require('moment');
 
+const knex = require('../knex');
 const { matchTags } = require('../tags');
 
 async function fetchPhotos(url) {
@@ -126,7 +126,6 @@ async function scrapeScene(html, url, site) {
 
   const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
 
-  const rawTags = data.keywords.split(', ');
   const siteDomain = $('meta[name="twitter:domain"]').attr('content');
   const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
   const siteUrl = siteDomain && `https://www.${siteDomain}`;
@@ -136,11 +135,13 @@ async function scrapeScene(html, url, site) {
 
   const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain);
 
+  const rawTags = data.keywords.split(', ');
+
   const [channelSite, tags] = await Promise.all([
     site.isFallback
       ? knex('sites')
         .where({ url: siteUrl })
-        .orWhere({ id: siteId })
+        .orWhere({ slug: siteId })
         .first()
       : site,
     matchTags(rawTags),