Switched to tabs. Added missing actor entries when scraping actors, tagged with a batch ID.

ThePendulum 2020-05-14 04:26:05 +02:00
parent f1eb29c713
commit 11eb66f834
178 changed files with 16594 additions and 16929 deletions

View File

@@ -5,7 +5,7 @@ root = true
 [*]
 end_of_line = lf
 insert_final_newline = true
-indent_style = space
+indent_style = tab
 indent_size = 4
 
 # Matches multiple files with brace expansion notation

View File

@@ -7,13 +7,14 @@
 "sourceType": "module"
 },
 "rules": {
+"indent": ["error", "tab"],
+"no-tabs": "off",
 "no-unused-vars": ["error", {"argsIgnorePattern": "^_"}],
 "no-console": 0,
-"indent": "off",
 "template-curly-spacing": "off",
 "max-len": 0,
 "vue/no-v-html": 0,
-"vue/html-indent": ["error", 4],
+"vue/html-indent": ["error", "tab"],
 "vue/multiline-html-element-content-newline": 0,
 "vue/singleline-html-element-content-newline": 0,
 "no-param-reassign": ["error", {

View File

@@ -198,7 +198,7 @@
 <span v-else>Yes</span>
 </li>
-<li class="bio-item scraped hideable">Updated {{ formatDate(actor.scrapedAt, 'YYYY-MM-DD HH:mm') }}, ID: {{ actor.id }}</li>
+<li class="bio-item scraped hideable">Updated {{ formatDate(actor.updatedAt, 'YYYY-MM-DD HH:mm') }}, ID: {{ actor.id }}</li>
 </ul>
 <span

View File

@@ -22,7 +22,7 @@
 class="title"
 >
 <img
-:src="`/img/logos/${network.slug}/network.png`"
+:src="`/img/logos/${network.slug}/thumbs/network.png`"
 class="logo"
 >
 </a>
@@ -82,7 +82,7 @@
 class="title"
 >
 <img
-:src="`/img/logos/${network.slug}/network.png`"
+:src="`/img/logos/${network.slug}/thumbs/network.png`"
 class="logo"
 >
 </a>

View File

@@ -5,7 +5,7 @@
 class="tile"
 >
 <img
-:src="`/img/logos/${site.network.slug}/${site.slug}.png`"
+:src="`/img/logos/${site.network.slug}/thumbs/${site.slug}.png`"
 :alt="site.name"
 class="logo"
 >

View File

@@ -32,6 +32,8 @@ function curateActor(actor) {
 state: actor.residenceState,
 country: actor.residenceCountry,
 },
+scrapedAt: new Date(actor.createdAt),
+updatedAt: new Date(actor.updatedAt),
 };
 
 if (actor.avatar) {
@@ -90,6 +92,8 @@ function initActorActions(store, _router) {
 tattoos
 piercings
 description
+createdAt
+updatedAt
 network {
 id
 name

View File

@@ -30,6 +30,7 @@ module.exports = {
 'marycarey',
 'racqueldevonshire',
 ]],
+'boobpedia',
 ['blowpass', ['sunlustxxx']],
 ['ddfnetwork', [
 'fuckinhd',
@@ -39,6 +40,7 @@
 'daringsex',
 'lowartfilms',
 ]],
+'freeones',
 ['pornpros', [
 'milfhumiliation',
 'humiliated',
@@ -57,6 +59,9 @@
 'scorelandtv',
 'scoretv',
 ]],
+['mindgeek', [
+'pornhub',
+]],
 ],
 profiles: [
 [
@@ -147,7 +152,6 @@
 'boobpedia',
 'pornhub',
 'freeones',
-'freeonesLegacy',
 ],
 proxy: {
 enable: false,

View File

@@ -247,6 +247,13 @@ exports.up = knex => Promise.resolve()
 table.datetime('created_at')
 .defaultTo(knex.fn.now());
 }))
+.then(() => knex.schema.createTable('batches', (table) => {
+table.increments('id', 12);
+table.text('comment');
+table.datetime('created_at')
+.defaultTo(knex.fn.now());
+}))
 .then(() => knex.schema.createTable('actors', (table) => {
 table.increments('id', 12);
@@ -298,7 +305,9 @@
 table.string('piercings');
 table.string('tattoos');
-table.integer('batch_id', 12);
+table.integer('batch_id', 12)
+.references('id')
+.inTable('batches');
 table.datetime('updated_at')
 .defaultTo(knex.fn.now());
@@ -317,6 +326,13 @@
 .references('id')
 .inTable('networks');
+table.integer('site_id', 12)
+.references('id')
+.inTable('sites');
+table.unique(['actor_id', 'network_id']);
+table.unique(['actor_id', 'site_id']);
 table.date('birthdate');
 table.string('gender', 18);
 table.text('description');
@@ -575,21 +591,17 @@
 table.datetime('created_at')
 .defaultTo(knex.fn.now());
 }))
-.then(() => knex.schema.createTable('batches', (table) => {
-table.increments('id', 12);
-table.text('comment');
-table.datetime('created_at')
-.defaultTo(knex.fn.now());
-}))
 .then(() => knex.schema.createTable('releases', (table) => {
 table.increments('id', 16);
 table.integer('site_id', 12)
-.notNullable()
 .references('id')
 .inTable('sites');
+table.integer('network_id', 12)
+.references('id')
+.inTable('networks');
 table.integer('studio_id', 12)
 .references('id')
 .inTable('studios');
@@ -599,7 +611,7 @@
 table.string('shoot_id');
 table.string('entry_id');
-table.unique(['site_id', 'entry_id', 'type']);
+table.unique(['site_id', 'network_id', 'entry_id', 'type']);
 table.string('url', 1000);
 table.string('title');
@@ -753,6 +765,9 @@
 .inTable('releases');
 }))
 .then(() => knex.raw(`
+ALTER TABLE releases
+ADD CONSTRAINT ensure_site_or_network CHECK (site_id IS NOT NULL OR network_id IS NOT NULL);
 ALTER TABLE releases_search
 ADD COLUMN document tsvector;
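For reference, a minimal sketch (not part of the commit) of what the new ensure_site_or_network constraint enforces, using the releases columns from the migration above; the inserted values are made up for illustration:

// Sketch only: a release row must now carry at least one of site_id or network_id.
await knex('releases').insert({ entry_id: 'abc123', title: 'Example', network_id: 5 }); // accepted: network-level release without a site
await knex('releases').insert({ entry_id: 'abc124', title: 'Example' }); // rejected by ensure_site_or_network: neither site_id nor network_id is set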

Binary image files changed (several images added, several existing images updated); contents not shown.

View File

@@ -3066,6 +3066,13 @@ const sites = [
 network: 'men',
 },
 // MINDGEEK
+{
+slug: 'pornhub',
+name: 'PornHub',
+url: 'https://www.pornhub.com',
+description: '',
+network: 'mindgeek',
+},
 {
 slug: 'tube8vip',
 name: 'Tube8Vip',

View File

@@ -14,7 +14,7 @@ const tagPosters = [
 ['blowbang', 'poster', 'Marsha May in "Feeding Frenzy 12" for Jules Jordan'],
 ['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
 ['brunette', 0, 'Nicole Black in GIO971 for LegalPorno'],
-['bukkake', 'poster', 'Mia Malkova in "Facialized 2" for HardX'],
+['bukkake', 0, 'Jaye Summers in "Facialized 5" for HardX'],
 ['caucasian', 0, 'Remy Lacroix for HardX'],
 ['creampie', 'poster', 'ALina Lopez in "Making Yourself Unforgettable" for Blacked'],
 ['cum-in-mouth', 1, 'Sarah Vandella in "Blow Bang Vandella" for HardX'],
@@ -76,6 +76,7 @@
 // ['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
 // ['anal', 0, 'Veronica Leal'],
 ['behind-the-scenes', 1, 'Madison Ivy in "Day With A Pornstar" for Brazzers'],
+['bukkake', 'poster', 'Mia Malkova in "Facialized 2" for HardX'],
 ['caucasian', 1, 'Sheena Shaw for Brazzers'],
 ['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'],
 ['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'],

View File

@@ -6,10 +6,11 @@
 },
 "rules": {
 "strict": 0,
+"indent": ["error", "tab"],
+"no-tabs": "off",
 "no-unused-vars": ["error", {"argsIgnorePattern": "^_"}],
 "no-console": 0,
 "no-underscore-dangle": 0,
-"indent": "off",
 "prefer-destructuring": "off",
 "template-curly-spacing": "off",
 "object-curly-newline": "off",

View File

@@ -1,7 +1,13 @@
 'use strict';
 
+const config = require('config');
+const Promise = require('bluebird');
+
 // const logger = require('./logger')(__filename);
 const knex = require('./knex');
+const scrapers = require('./scrapers/scrapers');
+const argv = require('./argv');
 const slugify = require('./utils/slugify');
 const capitalize = require('./utils/capitalize');
@@ -13,7 +19,7 @@ function toBaseActors(actorsOrNames, release) {
 const baseActor = {
 name,
 slug,
-network: release.site.network,
+network: release?.site.network,
 };
 
 if (actorOrName.name) {
@@ -40,8 +46,34 @@ function curateActorEntries(baseActors, batchId) {
 return baseActors.map(baseActor => curateActorEntry(baseActor, batchId));
 }
 
-async function scrapeProfiles() {
+async function scrapeActors(actorNames) {
+const baseActors = toBaseActors(actorNames);
+
+const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
+const siteSlugs = sources.flat();
+
+const [networks, sites, existingActorEntries] = await Promise.all([
+knex('networks').whereIn('slug', siteSlugs),
+knex('sites').whereIn('slug', siteSlugs),
+knex('actors')
+.select(['id', 'name', 'slug'])
+.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
+.whereNull('network_id'),
+]);
+
+const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
+const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: { ...network, isNetwork: true } }), {});
+const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
+
+const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
+
+const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
+const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId);
+const newActorEntries = batchId && await knex('actors').insert(curatedActorEntries).returning(['id', 'name', 'slug']);
+
+const actorEntries = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
+
+console.log(actorEntries, newActorEntries, actorEntries);
 }
 
 async function getOrCreateActors(baseActors, batchId) {
@@ -98,7 +130,7 @@ async function associateActors(releases, batchId) {
 const uniqueBaseActors = Object.values(baseActorsBySlugAndNetworkId).map(baseActorsByNetworkId => Object.values(baseActorsByNetworkId)).flat();
 const actors = await getOrCreateActors(uniqueBaseActors, batchId);
-console.log(actors);
 
 const actorIdsBySlugAndNetworkId = actors.reduce((acc, actor) => ({
 ...acc,
 [actor.network_id]: {
@@ -107,8 +139,6 @@ async function associateActors(releases, batchId) {
 },
 }), {});
 
-console.log(actorIdsBySlugAndNetworkId);
-
 const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
 .map(([releaseId, releaseActors]) => releaseActors
 .map(releaseActor => ({
@@ -122,4 +152,5 @@ async function associateActors(releases, batchId) {
 module.exports = {
 associateActors,
+scrapeActors,
 };
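As a quick reference, a minimal sketch (not from the commit) of the insert-with-batch pattern the new scrapeActors uses, assuming the batches and actors tables from the migration above; the actor values are made up for illustration:

// Sketch only: create a batch, then tag the newly inserted actor rows with its ID.
const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
const newActorEntries = await knex('actors')
	.insert([{ name: 'Jane Doe', slug: 'janedoe', batch_id: batchId }])
	.returning(['id', 'name', 'slug']);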

View File

@@ -7,7 +7,7 @@ const knex = require('./knex');
 const fetchUpdates = require('./updates');
 const { fetchScenes, fetchMovies } = require('./deep');
 const { storeReleases, updateReleasesSearch } = require('./store-releases');
-const { scrapeActors } = require('./actors-legacy');
+const { scrapeActors } = require('./actors');
 
 async function init() {
 if (argv.server) {

View File

@@ -42,7 +42,7 @@ async function findSites(baseReleases) {
 const sites = await curateSites(siteEntries, true, false);
 const networks = await curateNetworks(networkEntries, true, false, false);
-const markedNetworks = networks.map(network => ({ ...network, isFallback: true }));
+const markedNetworks = networks.map(network => ({ ...network, isNetwork: true }));
 
 const sitesBySlug = []
 .concat(markedNetworks, sites)
@@ -86,6 +86,7 @@ function toBaseReleases(baseReleasesOrUrls) {
 async function scrapeRelease(baseRelease, sites, type = 'scene') {
 const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)];
+const siteWithFallbackNetwork = site.isNetwork ? { ...site, network: site } : site; // make site.network available, even when site is network fallback
 
 if (!site) {
 logger.warn(`No site available for ${baseRelease.url}`);
@@ -115,8 +116,8 @@
 logger.verbose(`Fetching ${type} ${baseRelease.url}`);
 
 const scrapedRelease = type === 'scene'
-? await scraper.fetchScene(baseRelease.url, site, baseRelease, null, include)
-: await scraper.fetchMovie(baseRelease.url, site, baseRelease, null, include);
+? await scraper.fetchScene(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include)
+: await scraper.fetchMovie(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include);
 
 const mergedRelease = {
 ...baseRelease,

View File

@@ -1,199 +0,0 @@
'use strict';
const config = require('config');
const Promise = require('bluebird');
const logger = require('./logger')(__filename);
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const knex = require('./knex');
const scrapers = require('./scrapers/scrapers');
const { findSiteByUrl } = require('./sites');
const { findNetworkByUrl } = require('./networks');
const { storeReleases } = require('./releases');
async function findSite(url, release) {
if (release?.site) return release.site;
if (!url) return null;
const site = await findSiteByUrl(url);
if (site) {
return site;
}
const network = await findNetworkByUrl(url);
if (network) {
return {
...network,
network,
isFallback: true,
};
}
return null;
}
async function scrapeRelease(source, basicRelease = null, type = 'scene', beforeFetchLatest) {
// profile scraper may return either URLs or pre-scraped scenes
const sourceIsUrlOrEmpty = typeof source === 'string' || source === undefined;
const url = sourceIsUrlOrEmpty ? source : source?.url;
const release = sourceIsUrlOrEmpty ? basicRelease : source;
const site = basicRelease?.site || await findSite(url, release);
if (!site) {
throw new Error(`Could not find site for ${url} in database`);
}
if (!argv.deep && release) {
return {
...release,
site,
};
}
const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
if (!scraper) {
throw new Error(`Could not find scraper for ${url}`);
}
if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
if (release) {
logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
return null;
}
throw new Error(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
}
if (!release) {
logger.info(`Scraping release from ${url}`);
}
const scrapedRelease = type === 'scene'
? await scraper.fetchScene(url, site, release, beforeFetchLatest, include)
: await scraper.fetchMovie(url, site, release, beforeFetchLatest, include);
return {
...release,
...scrapedRelease,
...(scrapedRelease && release?.tags && {
tags: release.tags.concat(scrapedRelease.tags),
}),
site,
};
}
async function accumulateMovies(releases) {
if (!argv.withMovies) return [];
const moviesByUrl = releases.reduce((acc, release) => {
if (!release.movie) return acc;
const movie = release.movie.url ? release.movie : { url: release.movie };
if (!acc[movie.url]) {
acc[movie.url] = {
...movie,
type: 'movie',
sceneIds: [],
};
}
acc[movie.url].sceneIds = acc[movie.url].sceneIds.concat(release.id);
return acc;
}, {});
const movies = await Promise.map(Object.values(moviesByUrl), async movie => scrapeRelease(movie, null, 'movie'));
const { releases: storedMovies } = await storeReleases(movies);
const movieAssociations = storedMovies.reduce((acc, movie) => acc.concat(movie.sceneIds.map(sceneId => ({
movie_id: movie.id,
scene_id: sceneId,
}))), []);
await knex('releases_movies').insert(movieAssociations);
// console.log(moviesByUrl);
return movies;
}
async function scrapeReleases(sources, type = 'scene') {
const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, null, type), {
concurrency: 5,
}).filter(Boolean);
const curatedReleases = scrapedReleases.map(scrapedRelease => ({ ...scrapedRelease, type }));
if ((argv.scene || argv.movie) && argv.inspect) {
// only show when fetching from URL
}
if (argv.save) {
const { releases: storedReleases } = await storeReleases(curatedReleases);
await accumulateMovies(storedReleases);
if (storedReleases) {
logger.info(storedReleases.map(storedRelease => `\nhttp://${config.web.host}:${config.web.port}/scene/${storedRelease.id}/${storedRelease.slug}`).join(''));
}
return storedReleases;
}
return curatedReleases;
}
async function scrapeScenes(sources) {
return scrapeReleases(sources, 'scene');
}
async function scrapeMovies(sources) {
return scrapeReleases(sources, 'movie');
}
async function deepFetchReleases(baseReleases, beforeFetchLatest) {
const deepReleases = await Promise.map(baseReleases, async (release) => {
if (release.url || (release.path && release.site)) {
try {
const fullRelease = await scrapeRelease(release.url, release, 'scene', beforeFetchLatest);
if (fullRelease) {
return {
...release,
...fullRelease,
deep: true,
};
}
logger.warn(`Release scraper returned empty result for ${release.url}`);
return release;
} catch (error) {
logger.error(`Failed to scrape ${release.url}: ${error}`);
return {
...release,
deep: false,
};
}
}
return release;
}, {
concurrency: 2,
});
return deepReleases;
}
module.exports = {
deepFetchReleases,
scrapeMovies,
scrapeRelease,
scrapeReleases,
scrapeScenes,
};

View File

@@ -1,184 +0,0 @@
'use strict';
const Promise = require('bluebird');
const moment = require('moment');
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const logger = require('./logger')(__filename);
const knex = require('./knex');
const { fetchIncludedSites } = require('./sites');
const scrapers = require('./scrapers/scrapers');
const { deepFetchReleases } = require('./scrape-releases');
const { storeReleases } = require('./releases');
function getAfterDate() {
if (/\d{2,4}-\d{2}-\d{2,4}/.test(argv.after)) {
// using date
return moment
.utc(argv.after, ['YYYY-MM-DD', 'DD-MM-YYYY'])
.toDate();
}
// using time distance (e.g. "1 month")
return moment
.utc()
.subtract(...argv.after.split(' '))
.toDate();
}
async function findDuplicateReleaseIds(latestReleases, accReleases) {
const duplicateReleases = await knex('releases')
.whereIn('entry_id', latestReleases.map(({ entryId }) => entryId));
// include accumulated releases as duplicates to prevent an infinite
// loop when the next page contains the same releases as the previous
return new Set(duplicateReleases
.map(release => String(release.entry_id))
.concat(accReleases.map(release => String(release.entryId))));
}
async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases, afterDate = getAfterDate(), accReleases = [], page = argv.page) {
if (!argv.latest || !scraper.fetchLatest) {
return [];
}
const latestReleases = await scraper.fetchLatest(site, page, beforeFetchLatest, accSiteReleases, include);
if (!Array.isArray(latestReleases)) {
logger.warn(`Scraper returned ${latestReleases || 'null'} when fetching latest from '${site.name}' on '${site.network.name}'`);
return accReleases;
}
if (latestReleases.length === 0) {
return accReleases;
}
const latestReleasesWithSite = latestReleases.map(release => ({ ...release, site }));
const oldestReleaseOnPage = latestReleases.slice(-1)[0].date;
const duplicateReleaseIds = argv.redownload ? new Set() : await findDuplicateReleaseIds(latestReleases, accReleases);
const uniqueReleases = latestReleasesWithSite
.filter(release => !duplicateReleaseIds.has(String(release.entryId)) // release is already in database
&& (argv.last || !release.date || moment(release.date).isAfter(afterDate))); // release is older than specified date limit
logger.verbose(`${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique recent releases`);
if (
uniqueReleases.length > 0
// && (oldestReleaseOnPage || page < argv.pages)
&& ((oldestReleaseOnPage
? moment(oldestReleaseOnPage).isAfter(afterDate)
: accReleases.length + uniqueReleases.length <= argv.nullDateLimit)
|| (argv.last && accReleases.length + uniqueReleases.length < argv.last))
) {
// oldest release on page is newer that specified date range, or latest count has not yet been met, fetch next page
return scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases, afterDate, accReleases.concat(uniqueReleases), page + 1);
}
if (argv.last && uniqueReleases.length >= argv.last) {
return accReleases.concat(uniqueReleases).slice(0, argv.last);
}
if (oldestReleaseOnPage) {
return accReleases.concat(uniqueReleases);
}
return accReleases.concat(uniqueReleases).slice(0, argv.nullDateLimit);
}
async function scrapeUpcomingReleases(scraper, site, beforeFetchLatest) {
if (argv.upcoming && scraper.fetchUpcoming) {
const upcomingReleases = await scraper.fetchUpcoming(site, 1, beforeFetchLatest, include);
return upcomingReleases
? upcomingReleases.map(release => ({ ...release, site, upcoming: true }))
: [];
}
return [];
}
async function scrapeSiteReleases(scraper, site, accSiteReleases) {
const beforeFetchLatest = await scraper.beforeFetchLatest?.(site, accSiteReleases);
const [newReleases, upcomingReleases] = await Promise.all([
scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases), // fetch basic release info from scene overview
scrapeUpcomingReleases(scraper, site, beforeFetchLatest, accSiteReleases), // fetch basic release info from upcoming overview
]);
if (argv.upcoming) {
logger.info(`${site.name}: ${argv.latest ? `Found ${newReleases.length}` : 'Ignoring'} latest releases,${argv.upcoming ? ' ' : ' ignoring '}${upcomingReleases.length || '0'} upcoming releases`);
}
const baseReleases = [...newReleases, ...upcomingReleases];
if (argv.deep) {
// follow URL for every release
return deepFetchReleases(baseReleases, beforeFetchLatest);
}
return baseReleases;
}
async function scrapeSite(site, network, accSiteReleases = []) {
if (site.parameters?.ignore) {
logger.warn(`Ignoring ${network.name}: ${site.name}`);
return [];
}
const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
if (!scraper) {
logger.warn(`No scraper found for '${site.name}' (${site.slug})`);
return [];
}
try {
const siteReleases = await scrapeSiteReleases(scraper, site, accSiteReleases);
return siteReleases.map(release => ({ ...release, site }));
} catch (error) {
logger.error(`${site.name}: Failed to scrape releases: ${error.message}`);
return [];
}
}
async function scrapeSites() {
const networks = await fetchIncludedSites();
const scrapedNetworks = await Promise.map(networks, async (network) => {
if (network.parameters?.sequential) {
logger.info(`Scraping '${network.name}' sequentially`);
return Promise.reduce(network.sites, async (acc, site) => {
const accSiteReleases = await acc;
const siteReleases = await scrapeSite(site, network, accSiteReleases);
return accSiteReleases.concat(siteReleases);
}, Promise.resolve([]));
}
return Promise.map(network.sites, async site => scrapeSite(site, network), {
concurrency: network.parameters?.concurrency || 2,
});
},
{
// 5 networks at a time
concurrency: 5,
});
const releases = scrapedNetworks.flat(2);
if (argv.inspect) {
console.log(releases);
}
if (argv.save) {
await storeReleases(releases);
}
}
module.exports = scrapeSites;

View File

@@ -7,7 +7,7 @@ const { fetchScene, fetchLatest, fetchUpcoming, fetchProfile } = require('./gamm
 async function fetchSceneWrapper(url, site, baseRelease) {
 const release = await fetchScene(url, site, baseRelease);
 
-if (site.isFallback && release.channel) {
+if (site.isNetwork && release.channel) {
 const channelUrl = url.replace('blowpass.com', `${release.channel}.com`);
 
 if (['onlyteenblowjobs', 'mommyblowsbest'].includes(release.channel)) {

View File

@@ -392,9 +392,8 @@ async function fetchProfile(actorName, scraperSlug, site) {
 : await get(`${site.url}/${t1}models/${actorSlugA}.html`);
 
 const res = (res1.ok && res1)
-|| (site.parameters?.profile
-? await get(util.format(site.parameters.profile, actorSlugB))
-: await get(`${site.url}/${t1}models/${actorSlugB}.html`));
+|| (site.parameters?.profile && await get(util.format(site.parameters.profile, actorSlugB)))
+|| await get(`${site.url}/${t1}models/${actorSlugB}.html`);
 
 if (!res.ok) return res.status;
 if (site.parameters?.t1) return scrapeProfileT1(res.item, site);

View File

@@ -118,7 +118,7 @@ async function fetchScene(url, site) {
 const res = await bhttp.get(url);
 
 if (res.statusCode === 200) {
-if (site.isFallback) {
+if (site.isNetwork) {
 const entryId = scrapeFallbackLanding(res.body.toString(), url);
 
 const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', {

View File

@@ -19,7 +19,8 @@ function curateReleaseEntry(release, batchId, existingRelease) {
 const curatedRelease = {
 title: release.title,
 entry_id: release.entryId || null,
-site_id: release.site.id,
+site_id: release.site?.id,
+network_id: release.site ? null : release.network?.id, // prefer site ID if available
 shoot_id: release.shootId || null,
 studio_id: release.studio?.id || null,
 url: release.url,
@@ -45,7 +46,7 @@
 }
 
 async function attachChannelSites(releases) {
-const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isFallback));
+const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isNetwork));
 
 const channelSites = await knex('sites')
 .leftJoin('networks', 'networks.id', 'sites.network_id')
@@ -56,7 +57,7 @@
 const releasesWithChannelSite = await Promise.all(releases
 .map(async (release) => {
-if (release.site && !release.site.isFallback) {
+if (release.site && !release.site.isNetwork) {
 return release;
 }
@@ -69,6 +70,14 @@
 };
 }
 
+if (release.site && release.site.isNetwork) {
+return {
+...release,
+site: null,
+network: release.site,
+};
+}
+
 logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);
 return null;
@@ -197,6 +206,10 @@ async function updateReleasesSearch(releaseIds) {
 }
 
 async function storeReleases(releases) {
+if (releases.length === 0) {
+return [];
+}
+
 const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
 
 const releasesWithSites = await attachChannelSites(releases);
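A short sketch (not part of the commit) of how the network-fallback handling above plays out for a release that only matched a network, using the fields shown in the diff; the values are illustrative:

// Sketch only: a release left without a resolvable channel site keeps the network instead.
const release = {
	title: 'Example scene',
	entryId: 'abc123',
	site: null,
	network: { id: 5, slug: 'mindgeek' }, // set by attachChannelSites when release.site.isNetwork
};

// curateReleaseEntry then prefers site_id and falls back to network_id, roughly:
// { title: 'Example scene', entry_id: 'abc123', network_id: 5, ... }
// which satisfies the ensure_site_or_network check added in the migration.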

View File

@@ -1,29 +0,0 @@
'use strict';
const path = require('path');
const Promise = require('bluebird');
const fs = require('fs-extra');
const fetchScene = require('../scrape-releases');
const argv = require('../argv');
async function renameFiles() {
const filenames = await fs.readdir(process.cwd());
const curated = await Promise.map(filenames, async (filename) => {
const shootId = filename.split(' ')[1];
const scene = await fetchScene(`https://kink.com/shoot/${shootId}`);
if (argv.confirm) {
await fs.rename(path.join(process.cwd(), filename), path.join(process.cwd(), `${scene.filename}.mp4`));
}
return scene.filename;
}, {
concurrency: 5,
});
console.log(curated);
}
renameFiles();