Tweaked Spizoo scraper for Goth Girlfriends.
This commit is contained in:
parent b41317706f
commit d3f15a6a2b
|
@ -5,31 +5,83 @@ exports.up = async (knex) => {
|
||||||
table.integer('scene_id')
|
table.integer('scene_id')
|
||||||
.notNullable()
|
.notNullable()
|
||||||
.references('id')
|
.references('id')
|
||||||
.inTable('releases');
|
.inTable('releases')
|
||||||
|
.onDelete('set null');
|
||||||
|
|
||||||
table.integer('user_id')
|
table.integer('user_id')
|
||||||
.references('id')
|
.references('id')
|
||||||
.inTable('users');
|
.inTable('users')
|
||||||
|
.onDelete('set null');
|
||||||
|
|
||||||
table.json('base');
|
table.json('base')
|
||||||
table.json('deltas');
|
.notNullable();
|
||||||
|
|
||||||
|
table.json('deltas')
|
||||||
|
.notNullable();
|
||||||
|
|
||||||
|
table.text('hash')
|
||||||
|
.notNullable();
|
||||||
|
|
||||||
table.text('comment');
|
table.text('comment');
|
||||||
|
|
||||||
table.datetime('applied_at');
|
table.boolean('approved');
|
||||||
|
|
||||||
table.integer('approved_by')
|
table.integer('reviewed_by')
|
||||||
.references('id')
|
.references('id')
|
||||||
.inTable('users');
|
.inTable('users')
|
||||||
|
.onDelete('set null');
|
||||||
|
|
||||||
table.text('error');
|
table.datetime('reviewed_at');
|
||||||
|
table.text('feedback');
|
||||||
|
|
||||||
|
table.datetime('applied_at');
|
||||||
|
|
||||||
table.datetime('created_at')
|
table.datetime('created_at')
|
||||||
.notNullable()
|
.notNullable()
|
||||||
.defaultTo(knex.fn.now());
|
.defaultTo(knex.fn.now());
|
||||||
});
|
});
|
||||||
|
|
||||||
|
await knex.schema.createTable('bans', (table) => {
|
||||||
|
table.increments('id');
|
||||||
|
|
||||||
|
table.integer('user_id')
|
||||||
|
.references('id')
|
||||||
|
.inTable('users')
|
||||||
|
.onDelete('set null');
|
||||||
|
|
||||||
|
table.string('username');
|
||||||
|
table.specificType('ip', 'cidr');
|
||||||
|
|
||||||
|
table.boolean('match_all')
|
||||||
|
.notNullable()
|
||||||
|
.defaultTo(false);
|
||||||
|
|
||||||
|
table.string('scope');
|
||||||
|
table.boolean('shadow');
|
||||||
|
|
||||||
|
table.integer('banned_by')
|
||||||
|
.references('id')
|
||||||
|
.inTable('users')
|
||||||
|
.onDelete('set null');
|
||||||
|
|
||||||
|
table.datetime('expires_at')
|
||||||
|
.notNullable();
|
||||||
|
|
||||||
|
table.datetime('created_at')
|
||||||
|
.notNullable()
|
||||||
|
.defaultTo(knex.fn.now());
|
||||||
|
});
|
||||||
|
|
||||||
|
await knex.schema.alterTable('users', (table) => {
|
||||||
|
table.specificType('last_ip', 'cidr');
|
||||||
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
exports.down = async (knex) => {
|
exports.down = async (knex) => {
|
||||||
await knex.schema.dropTable('scenes_revisions');
|
await knex.schema.dropTable('scenes_revisions');
|
||||||
|
await knex.schema.dropTable('bans');
|
||||||
|
|
||||||
|
await knex.schema.alterTable('users', (table) => {
|
||||||
|
table.dropColumn('last_ip');
|
||||||
|
});
|
||||||
};
|
};
|
||||||
|
|
|
@ -10734,6 +10734,15 @@ const sites = [
|
||||||
tags: ['stripper'],
|
tags: ['stripper'],
|
||||||
parent: 'spizoo',
|
parent: 'spizoo',
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
slug: 'gothgirlfriends',
|
||||||
|
name: 'Goth Girlfriends',
|
||||||
|
url: 'https://www.gothgirlfriends.com',
|
||||||
|
parent: 'spizoo',
|
||||||
|
parameters: {
|
||||||
|
latest: '/categories/videos_{page}_d.html',
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
slug: 'intimatelesbians',
|
slug: 'intimatelesbians',
|
||||||
name: 'Intimate Lesbians',
|
name: 'Intimate Lesbians',
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
|
const unprint = require('unprint');
|
||||||
|
const format = require('template-format');
|
||||||
|
|
||||||
const qu = require('../utils/qu');
|
const qu = require('../utils/qu');
|
||||||
const slugify = require('../utils/slugify');
|
const slugify = require('../utils/slugify');
|
||||||
|
|
||||||
|
@ -14,17 +17,19 @@ function scrapeAll(scenes) {
|
||||||
release.url = query.url('a');
|
release.url = query.url('a');
|
||||||
release.entryId = getEntryId(release.url);
|
release.entryId = getEntryId(release.url);
|
||||||
|
|
||||||
release.title = query.cnt('.title-label a, .thumb-title a, .p-7, .text h3');
|
release.title = query.content('.title-label a, .thumb-title a, .p-7, .text h3');
|
||||||
release.date = query.date('.date-label', 'MM/DD/YYYY');
|
release.date = query.date('.date-label', 'MM/DD/YYYY');
|
||||||
|
|
||||||
release.actors = query.all(['.update_models a', '.tour_update_models a', '.pornstar-label span']).map((el) => ({
|
release.actors = query.all(['.update_models a', '.tour_update_models a', '.pornstar-label span']).map((el) => ({
|
||||||
name: query.cnt(el),
|
name: unprint.query.content(el),
|
||||||
url: query.url(el, null),
|
url: unprint.query.url(el, null),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
release.poster = query.img('a img');
|
release.poster = query.img('a img');
|
||||||
release.teaser = query.video('.leVideo source');
|
release.teaser = query.video('.leVideo source');
|
||||||
|
|
||||||
|
console.log(release);
|
||||||
|
|
||||||
return release;
|
return release;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -47,7 +52,7 @@ function scrapeScene({ query }, url) {
|
||||||
|
|
||||||
release.tags = query.cnts('.categories-holder a, #sceneInfo a[href*="/categories"], #trailer-data a[href*="/categories"]');
|
release.tags = query.cnts('.categories-holder a, #sceneInfo a[href*="/categories"], #trailer-data a[href*="/categories"]');
|
||||||
|
|
||||||
const poster = query.img(['#video-holder .update_thumb', '#noMore .update_thumb', '#hpromo .update_thumb', '.trailer-thumb']) || query.poster('#trailervideo');
|
const poster = query.img(['#video-holder .update_thumb', '#video-holder .thumb', '#noMore .update_thumb', '#hpromo .update_thumb', '.trailer-thumb']) || query.poster('#trailervideo');
|
||||||
const posterPathname = poster && new URL(poster)?.pathname;
|
const posterPathname = poster && new URL(poster)?.pathname;
|
||||||
|
|
||||||
release.poster = [poster, poster?.replace(/imgw=\w+/, 'imgw=680')];
|
release.poster = [poster, poster?.replace(/imgw=\w+/, 'imgw=680')];
|
||||||
|
@ -62,6 +67,8 @@ function scrapeScene({ query }, url) {
|
||||||
release.trailer = query.video('#trailervideo source[type="video/mp4"], #FulsSizeVideo source[type="video/mp4"]'); // sic
|
release.trailer = query.video('#trailervideo source[type="video/mp4"], #FulsSizeVideo source[type="video/mp4"]'); // sic
|
||||||
release.teaser = query.video('#trailer-video source[src*="/videothumbs"]');
|
release.teaser = query.video('#trailer-video source[src*="/videothumbs"]');
|
||||||
|
|
||||||
|
console.log(release);
|
||||||
|
|
||||||
return release;
|
return release;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -131,10 +138,14 @@ function scrapeProfile({ query, el }) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchLatest(channel, page) {
|
async function fetchLatest(channel, page) {
|
||||||
const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail');
|
// const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail');
|
||||||
|
|
||||||
|
const res = await unprint.get(`${channel.url}${format(channel.parameters?.latest || '/categories/movies_{page}_d.html', { page })}`, {
|
||||||
|
selectAll: '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail',
|
||||||
|
});
|
||||||
|
|
||||||
if (res.ok) {
|
if (res.ok) {
|
||||||
return scrapeAll(res.items, channel);
|
return scrapeAll(res.context, channel);
|
||||||
}
|
}
|
||||||
|
|
||||||
return res.status;
|
return res.status;
|
||||||
|
|
src/sites.js (24 changed lines)
|
@ -71,7 +71,11 @@ async function findSiteByUrl(url) {
|
||||||
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
||||||
.select(
|
.select(
|
||||||
'sites.*',
|
'sites.*',
|
||||||
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
|
'networks.name as network_name',
|
||||||
|
'networks.slug as network_slug',
|
||||||
|
'networks.url as network_url',
|
||||||
|
'networks.description as network_description',
|
||||||
|
'networks.parameters as network_parameters',
|
||||||
)
|
)
|
||||||
.where('sites.url', url)
|
.where('sites.url', url)
|
||||||
.orWhere('sites.url', origin)
|
.orWhere('sites.url', origin)
|
||||||
|
@ -114,7 +118,11 @@ async function fetchSitesFromArgv() {
|
||||||
const rawSites = await knex('sites')
|
const rawSites = await knex('sites')
|
||||||
.select(
|
.select(
|
||||||
'sites.*',
|
'sites.*',
|
||||||
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
|
'networks.name as network_name',
|
||||||
|
'networks.slug as network_slug',
|
||||||
|
'networks.url as network_url',
|
||||||
|
'networks.description as network_description',
|
||||||
|
'networks.parameters as network_parameters',
|
||||||
)
|
)
|
||||||
.whereIn('sites.slug', argv.sites || [])
|
.whereIn('sites.slug', argv.sites || [])
|
||||||
.orWhereIn('networks.slug', argv.networks || [])
|
.orWhereIn('networks.slug', argv.networks || [])
|
||||||
|
@ -133,7 +141,11 @@ async function fetchSitesFromConfig() {
|
||||||
const rawSites = await knex('sites')
|
const rawSites = await knex('sites')
|
||||||
.select(
|
.select(
|
||||||
'sites.*',
|
'sites.*',
|
||||||
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
|
'networks.name as network_name',
|
||||||
|
'networks.slug as network_slug',
|
||||||
|
'networks.url as network_url',
|
||||||
|
'networks.description as network_description',
|
||||||
|
'networks.parameters as network_parameters',
|
||||||
)
|
)
|
||||||
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
||||||
.where((builder) => {
|
.where((builder) => {
|
||||||
|
@ -168,7 +180,11 @@ async function fetchSites(queryObject) {
|
||||||
.where((builder) => whereOr(queryObject, 'sites', builder))
|
.where((builder) => whereOr(queryObject, 'sites', builder))
|
||||||
.select(
|
.select(
|
||||||
'sites.*',
|
'sites.*',
|
||||||
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
|
'networks.name as network_name',
|
||||||
|
'networks.slug as network_slug',
|
||||||
|
'networks.url as network_url',
|
||||||
|
'networks.description as network_description',
|
||||||
|
'networks.parameters as network_parameters',
|
||||||
)
|
)
|
||||||
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
||||||
.limit(100);
|
.limit(100);
|
||||||
|
|
Loading…
Reference in New Issue