Tweaked Spizoo scraper for Goth Girlfriends.

This commit is contained in:
DebaucheryLibrarian
2024-09-13 01:22:46 +02:00
parent b41317706f
commit d3f15a6a2b
4 changed files with 106 additions and 18 deletions

View File

@@ -1,5 +1,8 @@
'use strict';
const unprint = require('unprint');
const format = require('template-format');
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
@@ -14,17 +17,19 @@ function scrapeAll(scenes) {
release.url = query.url('a');
release.entryId = getEntryId(release.url);
release.title = query.cnt('.title-label a, .thumb-title a, .p-7, .text h3');
release.title = query.content('.title-label a, .thumb-title a, .p-7, .text h3');
release.date = query.date('.date-label', 'MM/DD/YYYY');
release.actors = query.all(['.update_models a', '.tour_update_models a', '.pornstar-label span']).map((el) => ({
name: query.cnt(el),
url: query.url(el, null),
name: unprint.query.content(el),
url: unprint.query.url(el, null),
}));
release.poster = query.img('a img');
release.teaser = query.video('.leVideo source');
console.log(release);
return release;
});
}
@@ -47,7 +52,7 @@ function scrapeScene({ query }, url) {
release.tags = query.cnts('.categories-holder a, #sceneInfo a[href*="/categories"], #trailer-data a[href*="/categories"]');
const poster = query.img(['#video-holder .update_thumb', '#noMore .update_thumb', '#hpromo .update_thumb', '.trailer-thumb']) || query.poster('#trailervideo');
const poster = query.img(['#video-holder .update_thumb', '#video-holder .thumb', '#noMore .update_thumb', '#hpromo .update_thumb', '.trailer-thumb']) || query.poster('#trailervideo');
const posterPathname = poster && new URL(poster)?.pathname;
release.poster = [poster, poster?.replace(/imgw=\w+/, 'imgw=680')];
@@ -62,6 +67,8 @@ function scrapeScene({ query }, url) {
release.trailer = query.video('#trailervideo source[type="video/mp4"], #FulsSizeVideo source[type="video/mp4"]'); // sic
release.teaser = query.video('#trailer-video source[src*="/videothumbs"]');
console.log(release);
return release;
}
@@ -131,10 +138,14 @@ function scrapeProfile({ query, el }) {
}
async function fetchLatest(channel, page) {
const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail');
// const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail');
const res = await unprint.get(`${channel.url}${format(channel.parameters?.latest || '/categories/movies_{page}_d.html', { page })}`, {
selectAll: '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail',
});
if (res.ok) {
return scrapeAll(res.items, channel);
return scrapeAll(res.context, channel);
}
return res.status;

View File

@@ -71,7 +71,11 @@ async function findSiteByUrl(url) {
.leftJoin('networks', 'sites.network_id', 'networks.id')
.select(
'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
'networks.name as network_name',
'networks.slug as network_slug',
'networks.url as network_url',
'networks.description as network_description',
'networks.parameters as network_parameters',
)
.where('sites.url', url)
.orWhere('sites.url', origin)
@@ -114,7 +118,11 @@ async function fetchSitesFromArgv() {
const rawSites = await knex('sites')
.select(
'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
'networks.name as network_name',
'networks.slug as network_slug',
'networks.url as network_url',
'networks.description as network_description',
'networks.parameters as network_parameters',
)
.whereIn('sites.slug', argv.sites || [])
.orWhereIn('networks.slug', argv.networks || [])
@@ -133,7 +141,11 @@ async function fetchSitesFromConfig() {
const rawSites = await knex('sites')
.select(
'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
'networks.name as network_name',
'networks.slug as network_slug',
'networks.url as network_url',
'networks.description as network_description',
'networks.parameters as network_parameters',
)
.leftJoin('networks', 'sites.network_id', 'networks.id')
.where((builder) => {
@@ -168,7 +180,11 @@ async function fetchSites(queryObject) {
.where((builder) => whereOr(queryObject, 'sites', builder))
.select(
'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
'networks.name as network_name',
'networks.slug as network_slug',
'networks.url as network_url',
'networks.description as network_description',
'networks.parameters as network_parameters',
)
.leftJoin('networks', 'sites.network_id', 'networks.id')
.limit(100);