Refactored 21sextury scraper.

2019-12-09 05:00:49 +01:00
parent d874c508de
commit 04a89efa58
52 changed files with 2621 additions and 2068 deletions


@@ -8,7 +8,6 @@ const moment = require('moment');
 const knex = require('../knex');
 const { matchTags } = require('../tags');
-const pluckPhotos = require('../utils/pluck-photos');
 
 async function getPhoto(url) {
   const res = await bhttp.get(url);
@@ -20,7 +19,7 @@ async function getPhoto(url) {
   return photoUrl;
 }
 
-async function getPhotos(albumUrl, site, siteUrl) {
+async function getPhotos(albumUrl) {
   const res = await bhttp.get(albumUrl);
   const html = res.body.toString();
   const { document } = new JSDOM(html).window;
@@ -28,15 +27,7 @@ async function getPhotos(albumUrl, site, siteUrl) {
   const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
   const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10);
 
-  // dogfart has massive albums, pick 25 or specified number of photos: first, last and evenly inbetween
-  const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
-  const photoIndexes = pluckPhotos(lastPhotoIndex, photoLimit);
-
-  if (photoLimit > 25) {
-    console.log(`${site.name}: Scraping ${photoLimit} album photos from ${siteUrl}, this may take some time...`);
-  }
-
-  const photoUrls = await Promise.map(photoIndexes, async (index) => {
+  const photoUrls = await Promise.map(Array.from({ length: lastPhotoIndex }), async (index) => {
     const pageUrl = `https://blacksonblondes.com${lastPhotoPage.replace(/\d+.jpg/, `${index.toString().padStart(3, '0')}.jpg`)}`;
     return getPhoto(pageUrl);
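
The hunk above drops the album sampling: the old code used a pluck-photos helper to cap Dogfart's large albums at 25 (or a configured number of) evenly spread photos, while the new code fetches every photo up to lastPhotoIndex. For reference, a minimal sketch of what the dropped helper plausibly did, inferred only from the removed comment ("first, last and evenly inbetween"); the actual ../utils/pluck-photos implementation may differ:

// Hypothetical reconstruction of the dropped pluck-photos helper,
// based on the removed comment; not the repository's actual code.
function pluckPhotos(totalPhotos, limit = 25) {
    if (totalPhotos <= limit) {
        // Small album: keep every photo index (1-based, matching 001.jpg-style filenames).
        return Array.from({ length: totalPhotos }, (_, index) => index + 1);
    }

    // Large album: first, last, and evenly spaced indexes in between.
    const step = (totalPhotos - 1) / (limit - 1);
    return Array.from({ length: limit }, (_, index) => Math.round(1 + index * step));
}

One caveat with the replacement line, assuming Promise here is Bluebird's: Promise.map passes the array element as the first callback argument, and Array.from({ length: lastPhotoIndex }) yields only undefined elements, so the index the mapper needs would have to come from the second callback parameter (async (value, index) => ...) or from building real indexes up front, e.g. Array.from({ length: lastPhotoIndex }, (_, i) => i + 1).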