Tweaked Spizoo scraper for Goth Girlfriends.

This commit is contained in:
DebaucheryLibrarian 2024-09-13 01:22:46 +02:00
parent b41317706f
commit d3f15a6a2b
4 changed files with 106 additions and 18 deletions

View File

@ -5,31 +5,83 @@ exports.up = async (knex) => {
table.integer('scene_id') table.integer('scene_id')
.notNullable() .notNullable()
.references('id') .references('id')
.inTable('releases'); .inTable('releases')
.onDelete('set null');
table.integer('user_id') table.integer('user_id')
.references('id') .references('id')
.inTable('users'); .inTable('users')
.onDelete('set null');
table.json('base'); table.json('base')
table.json('deltas'); .notNullable();
table.json('deltas')
.notNullable();
table.text('hash')
.notNullable();
table.text('comment'); table.text('comment');
table.datetime('applied_at'); table.boolean('approved');
table.integer('approved_by') table.integer('reviewed_by')
.references('id') .references('id')
.inTable('users'); .inTable('users')
.onDelete('set null');
table.text('error'); table.datetime('reviewed_at');
table.text('feedback');
table.datetime('applied_at');
table.datetime('created_at') table.datetime('created_at')
.notNullable() .notNullable()
.defaultTo(knex.fn.now()); .defaultTo(knex.fn.now());
}); });
await knex.schema.createTable('bans', (table) => {
table.increments('id');
table.integer('user_id')
.references('id')
.inTable('users')
.onDelete('set null');
table.string('username');
table.specificType('ip', 'cidr');
table.boolean('match_all')
.notNullable()
.defaultTo(false);
table.string('scope');
table.boolean('shadow');
table.integer('banned_by')
.references('id')
.inTable('users')
.onDelete('set null');
table.datetime('expires_at')
.notNullable();
table.datetime('created_at')
.notNullable()
.defaultTo(knex.fn.now());
});
await knex.schema.alterTable('users', (table) => {
table.specificType('last_ip', 'cidr');
});
}; };
exports.down = async (knex) => { exports.down = async (knex) => {
await knex.schema.dropTable('scenes_revisions'); await knex.schema.dropTable('scenes_revisions');
await knex.schema.dropTable('bans');
await knex.schema.alterTable('users', (table) => {
table.dropColumn('last_ip');
});
}; };

View File

@ -10734,6 +10734,15 @@ const sites = [
tags: ['stripper'], tags: ['stripper'],
parent: 'spizoo', parent: 'spizoo',
}, },
{
slug: 'gothgirlfriends',
name: 'Goth Girlfriends',
url: 'https://www.gothgirlfriends.com',
parent: 'spizoo',
parameters: {
latest: '/categories/videos_{page}_d.html',
},
},
{ {
slug: 'intimatelesbians', slug: 'intimatelesbians',
name: 'Intimate Lesbians', name: 'Intimate Lesbians',

View File

@ -1,5 +1,8 @@
'use strict'; 'use strict';
const unprint = require('unprint');
const format = require('template-format');
const qu = require('../utils/qu'); const qu = require('../utils/qu');
const slugify = require('../utils/slugify'); const slugify = require('../utils/slugify');
@ -14,17 +17,19 @@ function scrapeAll(scenes) {
release.url = query.url('a'); release.url = query.url('a');
release.entryId = getEntryId(release.url); release.entryId = getEntryId(release.url);
release.title = query.cnt('.title-label a, .thumb-title a, .p-7, .text h3'); release.title = query.content('.title-label a, .thumb-title a, .p-7, .text h3');
release.date = query.date('.date-label', 'MM/DD/YYYY'); release.date = query.date('.date-label', 'MM/DD/YYYY');
release.actors = query.all(['.update_models a', '.tour_update_models a', '.pornstar-label span']).map((el) => ({ release.actors = query.all(['.update_models a', '.tour_update_models a', '.pornstar-label span']).map((el) => ({
name: query.cnt(el), name: unprint.query.content(el),
url: query.url(el, null), url: unprint.query.url(el, null),
})); }));
release.poster = query.img('a img'); release.poster = query.img('a img');
release.teaser = query.video('.leVideo source'); release.teaser = query.video('.leVideo source');
console.log(release);
return release; return release;
}); });
} }
@ -47,7 +52,7 @@ function scrapeScene({ query }, url) {
release.tags = query.cnts('.categories-holder a, #sceneInfo a[href*="/categories"], #trailer-data a[href*="/categories"]'); release.tags = query.cnts('.categories-holder a, #sceneInfo a[href*="/categories"], #trailer-data a[href*="/categories"]');
const poster = query.img(['#video-holder .update_thumb', '#noMore .update_thumb', '#hpromo .update_thumb', '.trailer-thumb']) || query.poster('#trailervideo'); const poster = query.img(['#video-holder .update_thumb', '#video-holder .thumb', '#noMore .update_thumb', '#hpromo .update_thumb', '.trailer-thumb']) || query.poster('#trailervideo');
const posterPathname = poster && new URL(poster)?.pathname; const posterPathname = poster && new URL(poster)?.pathname;
release.poster = [poster, poster?.replace(/imgw=\w+/, 'imgw=680')]; release.poster = [poster, poster?.replace(/imgw=\w+/, 'imgw=680')];
@ -62,6 +67,8 @@ function scrapeScene({ query }, url) {
release.trailer = query.video('#trailervideo source[type="video/mp4"], #FulsSizeVideo source[type="video/mp4"]'); // sic release.trailer = query.video('#trailervideo source[type="video/mp4"], #FulsSizeVideo source[type="video/mp4"]'); // sic
release.teaser = query.video('#trailer-video source[src*="/videothumbs"]'); release.teaser = query.video('#trailer-video source[src*="/videothumbs"]');
console.log(release);
return release; return release;
} }
@ -131,10 +138,14 @@ function scrapeProfile({ query, el }) {
} }
async function fetchLatest(channel, page) { async function fetchLatest(channel, page) {
const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail'); // const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail');
const res = await unprint.get(`${channel.url}${format(channel.parameters?.latest || '/categories/movies_{page}_d.html', { page })}`, {
selectAll: '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail',
});
if (res.ok) { if (res.ok) {
return scrapeAll(res.items, channel); return scrapeAll(res.context, channel);
} }
return res.status; return res.status;

View File

@ -71,7 +71,11 @@ async function findSiteByUrl(url) {
.leftJoin('networks', 'sites.network_id', 'networks.id') .leftJoin('networks', 'sites.network_id', 'networks.id')
.select( .select(
'sites.*', 'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', 'networks.name as network_name',
'networks.slug as network_slug',
'networks.url as network_url',
'networks.description as network_description',
'networks.parameters as network_parameters',
) )
.where('sites.url', url) .where('sites.url', url)
.orWhere('sites.url', origin) .orWhere('sites.url', origin)
@ -114,7 +118,11 @@ async function fetchSitesFromArgv() {
const rawSites = await knex('sites') const rawSites = await knex('sites')
.select( .select(
'sites.*', 'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', 'networks.name as network_name',
'networks.slug as network_slug',
'networks.url as network_url',
'networks.description as network_description',
'networks.parameters as network_parameters',
) )
.whereIn('sites.slug', argv.sites || []) .whereIn('sites.slug', argv.sites || [])
.orWhereIn('networks.slug', argv.networks || []) .orWhereIn('networks.slug', argv.networks || [])
@ -133,7 +141,11 @@ async function fetchSitesFromConfig() {
const rawSites = await knex('sites') const rawSites = await knex('sites')
.select( .select(
'sites.*', 'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', 'networks.name as network_name',
'networks.slug as network_slug',
'networks.url as network_url',
'networks.description as network_description',
'networks.parameters as network_parameters',
) )
.leftJoin('networks', 'sites.network_id', 'networks.id') .leftJoin('networks', 'sites.network_id', 'networks.id')
.where((builder) => { .where((builder) => {
@ -168,7 +180,11 @@ async function fetchSites(queryObject) {
.where((builder) => whereOr(queryObject, 'sites', builder)) .where((builder) => whereOr(queryObject, 'sites', builder))
.select( .select(
'sites.*', 'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', 'networks.name as network_name',
'networks.slug as network_slug',
'networks.url as network_url',
'networks.description as network_description',
'networks.parameters as network_parameters',
) )
.leftJoin('networks', 'sites.network_id', 'networks.id') .leftJoin('networks', 'sites.network_id', 'networks.id')
.limit(100); .limit(100);