Switched to tabs. Added missing actor entries when scraping actors, with batch ID.
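In short: when the actor scraper encounters names that are not yet in the database, it now creates a batches row first and stamps every newly inserted actor with that batch ID, so each scrape run can be traced later. A minimal sketch of the pattern, assuming a configured knex instance and the batches/actors tables created by the migration below (the column list is trimmed for illustration):

// Sketch: create one batch per scrape run and tag the new actor rows with its ID.
async function insertMissingActors(knex, newBaseActors) {
    if (newBaseActors.length === 0) return [];

    // One row in `batches` per run; its generated ID groups everything inserted below.
    const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

    const entries = newBaseActors.map(actor => ({
        name: actor.name,
        slug: actor.slug,
        batch_id: batchId,
    }));

    return knex('actors').insert(entries).returning(['id', 'name', 'slug']);
}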
@@ -5,7 +5,7 @@ root = true
[*]
end_of_line = lf
insert_final_newline = true
indent_style = space
indent_style = tab
indent_size = 4

# Matches multiple files with brace expansion notation
@@ -7,13 +7,14 @@
"sourceType": "module"
},
"rules": {
"indent": ["error", "tab"],
"no-tabs": "off",
"no-unused-vars": ["error", {"argsIgnorePattern": "^_"}],
"no-console": 0,
"indent": "off",
"template-curly-spacing": "off",
"max-len": 0,
"vue/no-v-html": 0,
"vue/html-indent": ["error", 4],
"vue/html-indent": ["error", "tab"],
"vue/multiline-html-element-content-newline": 0,
"vue/singleline-html-element-content-newline": 0,
"no-param-reassign": ["error", {
@@ -198,7 +198,7 @@
<span v-else>Yes</span>
</li>

<li class="bio-item scraped hideable">Updated {{ formatDate(actor.scrapedAt, 'YYYY-MM-DD HH:mm') }}, ID: {{ actor.id }}</li>
<li class="bio-item scraped hideable">Updated {{ formatDate(actor.updatedAt, 'YYYY-MM-DD HH:mm') }}, ID: {{ actor.id }}</li>
</ul>

<span
@@ -22,7 +22,7 @@
class="title"
>
<img
:src="`/img/logos/${network.slug}/network.png`"
:src="`/img/logos/${network.slug}/thumbs/network.png`"
class="logo"
>
</a>
@@ -82,7 +82,7 @@
class="title"
>
<img
:src="`/img/logos/${network.slug}/network.png`"
:src="`/img/logos/${network.slug}/thumbs/network.png`"
class="logo"
>
</a>
@@ -5,7 +5,7 @@
class="tile"
>
<img
:src="`/img/logos/${site.network.slug}/${site.slug}.png`"
:src="`/img/logos/${site.network.slug}/thumbs/${site.slug}.png`"
:alt="site.name"
class="logo"
>
@@ -32,6 +32,8 @@ function curateActor(actor) {
state: actor.residenceState,
country: actor.residenceCountry,
},
scrapedAt: new Date(actor.createdAt),
updatedAt: new Date(actor.updatedAt),
};

if (actor.avatar) {
@@ -90,6 +92,8 @@ function initActorActions(store, _router) {
tattoos
piercings
description
createdAt
updatedAt
network {
id
name
@@ -30,6 +30,7 @@ module.exports = {
'marycarey',
'racqueldevonshire',
]],
'boobpedia',
['blowpass', ['sunlustxxx']],
['ddfnetwork', [
'fuckinhd',
@@ -39,6 +40,7 @@ module.exports = {
'daringsex',
'lowartfilms',
]],
'freeones',
['pornpros', [
'milfhumiliation',
'humiliated',
@@ -57,6 +59,9 @@ module.exports = {
'scorelandtv',
'scoretv',
]],
['mindgeek', [
'pornhub',
]],
],
profiles: [
[
@@ -147,7 +152,6 @@ module.exports = {
'boobpedia',
'pornhub',
'freeones',
'freeonesLegacy',
],
proxy: {
enable: false,
@@ -247,6 +247,13 @@ exports.up = knex => Promise.resolve()
table.datetime('created_at')
.defaultTo(knex.fn.now());
}))
.then(() => knex.schema.createTable('batches', (table) => {
table.increments('id', 12);
table.text('comment');

table.datetime('created_at')
.defaultTo(knex.fn.now());
}))
.then(() => knex.schema.createTable('actors', (table) => {
table.increments('id', 12);
@@ -298,7 +305,9 @@ exports.up = knex => Promise.resolve()
table.string('piercings');
table.string('tattoos');

table.integer('batch_id', 12);
table.integer('batch_id', 12)
.references('id')
.inTable('batches');

table.datetime('updated_at')
.defaultTo(knex.fn.now());
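With the new foreign key in place, every actor row records which scrape batch created it. A small illustrative query, assuming a configured knex instance; table and column names follow the migration above:

// Illustrative: list the actors created by the most recent scrape batch.
async function actorsInLatestBatch(knex) {
    const [latestBatch] = await knex('batches')
        .orderBy('created_at', 'desc')
        .limit(1);

    if (!latestBatch) return [];

    return knex('actors')
        .where('batch_id', latestBatch.id)
        .select(['id', 'name', 'slug']);
}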
@@ -317,6 +326,13 @@
.references('id')
.inTable('networks');

table.integer('site_id', 12)
.references('id')
.inTable('sites');

table.unique(['actor_id', 'network_id']);
table.unique(['actor_id', 'site_id']);

table.date('birthdate');
table.string('gender', 18);
table.text('description');
@@ -575,21 +591,17 @@
table.datetime('created_at')
.defaultTo(knex.fn.now());
}))
.then(() => knex.schema.createTable('batches', (table) => {
table.increments('id', 12);
table.text('comment');

table.datetime('created_at')
.defaultTo(knex.fn.now());
}))
.then(() => knex.schema.createTable('releases', (table) => {
table.increments('id', 16);

table.integer('site_id', 12)
.notNullable()
.references('id')
.inTable('sites');

table.integer('network_id', 12)
.references('id')
.inTable('networks');

table.integer('studio_id', 12)
.references('id')
.inTable('studios');
@@ -599,7 +611,7 @@

table.string('shoot_id');
table.string('entry_id');
table.unique(['site_id', 'entry_id', 'type']);
table.unique(['site_id', 'network_id', 'entry_id', 'type']);

table.string('url', 1000);
table.string('title');
@@ -753,6 +765,9 @@ exports.up = knex => Promise.resolve()
.inTable('releases');
}))
.then(() => knex.raw(`
ALTER TABLE releases
ADD CONSTRAINT ensure_site_or_network CHECK (site_id IS NOT NULL OR network_id IS NOT NULL);

ALTER TABLE releases_search
ADD COLUMN document tsvector;
[Binary image assets: 10 new logo/thumbnail images added; 13 existing logo images modified without a change in size.]
@@ -3066,6 +3066,13 @@ const sites = [
network: 'men',
},
// MINDGEEK
{
slug: 'pornhub',
name: 'PornHub',
url: 'https://www.pornhub.com',
description: '',
network: 'mindgeek',
},
{
slug: 'tube8vip',
name: 'Tube8Vip',
@@ -14,7 +14,7 @@ const tagPosters = [
['blowbang', 'poster', 'Marsha May in "Feeding Frenzy 12" for Jules Jordan'],
['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
['brunette', 0, 'Nicole Black in GIO971 for LegalPorno'],
['bukkake', 'poster', 'Mia Malkova in "Facialized 2" for HardX'],
['bukkake', 0, 'Jaye Summers in "Facialized 5" for HardX'],
['caucasian', 0, 'Remy Lacroix for HardX'],
['creampie', 'poster', 'ALina Lopez in "Making Yourself Unforgettable" for Blacked'],
['cum-in-mouth', 1, 'Sarah Vandella in "Blow Bang Vandella" for HardX'],
@@ -76,6 +76,7 @@ const tagPhotos = [
// ['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
// ['anal', 0, 'Veronica Leal'],
['behind-the-scenes', 1, 'Madison Ivy in "Day With A Pornstar" for Brazzers'],
['bukkake', 'poster', 'Mia Malkova in "Facialized 2" for HardX'],
['caucasian', 1, 'Sheena Shaw for Brazzers'],
['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'],
['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'],
@@ -6,10 +6,11 @@
},
"rules": {
"strict": 0,
"indent": ["error", "tab"],
"no-tabs": "off",
"no-unused-vars": ["error", {"argsIgnorePattern": "^_"}],
"no-console": 0,
"no-underscore-dangle": 0,
"indent": "off",
"prefer-destructuring": "off",
"template-curly-spacing": "off",
"object-curly-newline": "off",
@@ -1,7 +1,13 @@
'use strict';

const config = require('config');
const Promise = require('bluebird');

// const logger = require('./logger')(__filename);
const knex = require('./knex');
const scrapers = require('./scrapers/scrapers');

const argv = require('./argv');
const slugify = require('./utils/slugify');
const capitalize = require('./utils/capitalize');
@@ -13,7 +19,7 @@ function toBaseActors(actorsOrNames, release) {
const baseActor = {
name,
slug,
network: release.site.network,
network: release?.site.network,
};

if (actorOrName.name) {
@@ -40,8 +46,34 @@ function curateActorEntries(baseActors, batchId) {
return baseActors.map(baseActor => curateActorEntry(baseActor, batchId));
}

async function scrapeProfiles() {
async function scrapeActors(actorNames) {
const baseActors = toBaseActors(actorNames);

const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
const siteSlugs = sources.flat();

const [networks, sites, existingActorEntries] = await Promise.all([
knex('networks').whereIn('slug', siteSlugs),
knex('sites').whereIn('slug', siteSlugs),
knex('actors')
.select(['id', 'name', 'slug'])
.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
.whereNull('network_id'),
]);

const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: { ...network, isNetwork: true } }), {});
const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});

const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);

const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId);
const newActorEntries = batchId && await knex('actors').insert(curatedActorEntries).returning(['id', 'name', 'slug']);

const actorEntries = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);

console.log(actorEntries, newActorEntries, actorEntries);
}

async function getOrCreateActors(baseActors, batchId) {
@@ -98,7 +130,7 @@ async function associateActors(releases, batchId) {
const uniqueBaseActors = Object.values(baseActorsBySlugAndNetworkId).map(baseActorsByNetworkId => Object.values(baseActorsByNetworkId)).flat();

const actors = await getOrCreateActors(uniqueBaseActors, batchId);
console.log(actors);

const actorIdsBySlugAndNetworkId = actors.reduce((acc, actor) => ({
...acc,
[actor.network_id]: {
@@ -107,8 +139,6 @@ async function associateActors(releases, batchId) {
},
}), {});

console.log(actorIdsBySlugAndNetworkId);

const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
.map(([releaseId, releaseActors]) => releaseActors
.map(releaseActor => ({
@@ -122,4 +152,5 @@

module.exports = {
associateActors,
scrapeActors,
};
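The module now exports scrapeActors alongside associateActors, and init.js below switches its import from actors-legacy to this module. A hypothetical call, assuming the actor names arrive as an array of strings (the exact CLI wiring is not part of this diff; actors.js itself only requires argv):

// Hypothetical usage; in practice the names would come from argv.
const { scrapeActors } = require('./actors');

scrapeActors(['Jane Doe', 'John Roe'])
    .then(() => process.exit(0))
    .catch((error) => {
        console.error(error);
        process.exit(1);
    });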
@@ -7,7 +7,7 @@ const knex = require('./knex');
const fetchUpdates = require('./updates');
const { fetchScenes, fetchMovies } = require('./deep');
const { storeReleases, updateReleasesSearch } = require('./store-releases');
const { scrapeActors } = require('./actors-legacy');
const { scrapeActors } = require('./actors');

async function init() {
if (argv.server) {
@@ -42,7 +42,7 @@ async function findSites(baseReleases) {

const sites = await curateSites(siteEntries, true, false);
const networks = await curateNetworks(networkEntries, true, false, false);
const markedNetworks = networks.map(network => ({ ...network, isFallback: true }));
const markedNetworks = networks.map(network => ({ ...network, isNetwork: true }));

const sitesBySlug = []
.concat(markedNetworks, sites)
@@ -86,6 +86,7 @@ function toBaseReleases(baseReleasesOrUrls) {

async function scrapeRelease(baseRelease, sites, type = 'scene') {
const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)];
const siteWithFallbackNetwork = site.isNetwork ? { ...site, network: site } : site; // make site.network available, even when site is network fallback

if (!site) {
logger.warn(`No site available for ${baseRelease.url}`);
@@ -115,8 +116,8 @@
logger.verbose(`Fetching ${type} ${baseRelease.url}`);

const scrapedRelease = type === 'scene'
? await scraper.fetchScene(baseRelease.url, site, baseRelease, null, include)
: await scraper.fetchMovie(baseRelease.url, site, baseRelease, null, include);
? await scraper.fetchScene(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include)
: await scraper.fetchMovie(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include);

const mergedRelease = {
...baseRelease,
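The idea behind siteWithFallbackNetwork above: when a URL only matched a network rather than a specific site, the network record itself stands in for the site, with a self-reference so scrapers can keep reading site.network. A minimal sketch of that wrapping, assuming site objects shaped like the curated ones elsewhere in this codebase:

// Sketch: let a network record stand in for a site without breaking site.network lookups.
function withFallbackNetwork(site) {
    // When the matched record is a network, point its `network` property at itself.
    return site && site.isNetwork ? { ...site, network: site } : site;
}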
@@ -1,199 +0,0 @@
'use strict';

const config = require('config');
const Promise = require('bluebird');

const logger = require('./logger')(__filename);
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const knex = require('./knex');
const scrapers = require('./scrapers/scrapers');
const { findSiteByUrl } = require('./sites');
const { findNetworkByUrl } = require('./networks');
const { storeReleases } = require('./releases');

async function findSite(url, release) {
if (release?.site) return release.site;
if (!url) return null;

const site = await findSiteByUrl(url);

if (site) {
return site;
}

const network = await findNetworkByUrl(url);

if (network) {
return {
...network,
network,
isFallback: true,
};
}

return null;
}

async function scrapeRelease(source, basicRelease = null, type = 'scene', beforeFetchLatest) {
// profile scraper may return either URLs or pre-scraped scenes
const sourceIsUrlOrEmpty = typeof source === 'string' || source === undefined;
const url = sourceIsUrlOrEmpty ? source : source?.url;
const release = sourceIsUrlOrEmpty ? basicRelease : source;

const site = basicRelease?.site || await findSite(url, release);

if (!site) {
throw new Error(`Could not find site for ${url} in database`);
}

if (!argv.deep && release) {
return {
...release,
site,
};
}

const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];

if (!scraper) {
throw new Error(`Could not find scraper for ${url}`);
}

if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
if (release) {
logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
return null;
}

throw new Error(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
}

if (!release) {
logger.info(`Scraping release from ${url}`);
}

const scrapedRelease = type === 'scene'
? await scraper.fetchScene(url, site, release, beforeFetchLatest, include)
: await scraper.fetchMovie(url, site, release, beforeFetchLatest, include);

return {
...release,
...scrapedRelease,
...(scrapedRelease && release?.tags && {
tags: release.tags.concat(scrapedRelease.tags),
}),
site,
};
}

async function accumulateMovies(releases) {
if (!argv.withMovies) return [];

const moviesByUrl = releases.reduce((acc, release) => {
if (!release.movie) return acc;
const movie = release.movie.url ? release.movie : { url: release.movie };

if (!acc[movie.url]) {
acc[movie.url] = {
...movie,
type: 'movie',
sceneIds: [],
};
}

acc[movie.url].sceneIds = acc[movie.url].sceneIds.concat(release.id);

return acc;
}, {});

const movies = await Promise.map(Object.values(moviesByUrl), async movie => scrapeRelease(movie, null, 'movie'));
const { releases: storedMovies } = await storeReleases(movies);

const movieAssociations = storedMovies.reduce((acc, movie) => acc.concat(movie.sceneIds.map(sceneId => ({
movie_id: movie.id,
scene_id: sceneId,
}))), []);

await knex('releases_movies').insert(movieAssociations);

// console.log(moviesByUrl);
return movies;
}

async function scrapeReleases(sources, type = 'scene') {
const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, null, type), {
concurrency: 5,
}).filter(Boolean);

const curatedReleases = scrapedReleases.map(scrapedRelease => ({ ...scrapedRelease, type }));

if ((argv.scene || argv.movie) && argv.inspect) {
// only show when fetching from URL
}

if (argv.save) {
const { releases: storedReleases } = await storeReleases(curatedReleases);

await accumulateMovies(storedReleases);

if (storedReleases) {
logger.info(storedReleases.map(storedRelease => `\nhttp://${config.web.host}:${config.web.port}/scene/${storedRelease.id}/${storedRelease.slug}`).join(''));
}

return storedReleases;
}

return curatedReleases;
}

async function scrapeScenes(sources) {
return scrapeReleases(sources, 'scene');
}

async function scrapeMovies(sources) {
return scrapeReleases(sources, 'movie');
}

async function deepFetchReleases(baseReleases, beforeFetchLatest) {
const deepReleases = await Promise.map(baseReleases, async (release) => {
if (release.url || (release.path && release.site)) {
try {
const fullRelease = await scrapeRelease(release.url, release, 'scene', beforeFetchLatest);

if (fullRelease) {
return {
...release,
...fullRelease,
deep: true,
};
}

logger.warn(`Release scraper returned empty result for ${release.url}`);

return release;
} catch (error) {
logger.error(`Failed to scrape ${release.url}: ${error}`);

return {
...release,
deep: false,
};
}
}

return release;
}, {
concurrency: 2,
});

return deepReleases;
}

module.exports = {
deepFetchReleases,
scrapeMovies,
scrapeRelease,
scrapeReleases,
scrapeScenes,
};
@@ -1,184 +0,0 @@
'use strict';

const Promise = require('bluebird');
const moment = require('moment');

const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const logger = require('./logger')(__filename);
const knex = require('./knex');
const { fetchIncludedSites } = require('./sites');
const scrapers = require('./scrapers/scrapers');
const { deepFetchReleases } = require('./scrape-releases');
const { storeReleases } = require('./releases');

function getAfterDate() {
if (/\d{2,4}-\d{2}-\d{2,4}/.test(argv.after)) {
// using date
return moment
.utc(argv.after, ['YYYY-MM-DD', 'DD-MM-YYYY'])
.toDate();
}

// using time distance (e.g. "1 month")
return moment
.utc()
.subtract(...argv.after.split(' '))
.toDate();
}

async function findDuplicateReleaseIds(latestReleases, accReleases) {
const duplicateReleases = await knex('releases')
.whereIn('entry_id', latestReleases.map(({ entryId }) => entryId));

// include accumulated releases as duplicates to prevent an infinite
// loop when the next page contains the same releases as the previous
return new Set(duplicateReleases
.map(release => String(release.entry_id))
.concat(accReleases.map(release => String(release.entryId))));
}

async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases, afterDate = getAfterDate(), accReleases = [], page = argv.page) {
if (!argv.latest || !scraper.fetchLatest) {
return [];
}

const latestReleases = await scraper.fetchLatest(site, page, beforeFetchLatest, accSiteReleases, include);

if (!Array.isArray(latestReleases)) {
logger.warn(`Scraper returned ${latestReleases || 'null'} when fetching latest from '${site.name}' on '${site.network.name}'`);
return accReleases;
}

if (latestReleases.length === 0) {
return accReleases;
}

const latestReleasesWithSite = latestReleases.map(release => ({ ...release, site }));

const oldestReleaseOnPage = latestReleases.slice(-1)[0].date;
const duplicateReleaseIds = argv.redownload ? new Set() : await findDuplicateReleaseIds(latestReleases, accReleases);

const uniqueReleases = latestReleasesWithSite
.filter(release => !duplicateReleaseIds.has(String(release.entryId)) // release is already in database
&& (argv.last || !release.date || moment(release.date).isAfter(afterDate))); // release is older than specified date limit

logger.verbose(`${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique recent releases`);

if (
uniqueReleases.length > 0
// && (oldestReleaseOnPage || page < argv.pages)
&& ((oldestReleaseOnPage
? moment(oldestReleaseOnPage).isAfter(afterDate)
: accReleases.length + uniqueReleases.length <= argv.nullDateLimit)
|| (argv.last && accReleases.length + uniqueReleases.length < argv.last))
) {
// oldest release on page is newer that specified date range, or latest count has not yet been met, fetch next page
return scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases, afterDate, accReleases.concat(uniqueReleases), page + 1);
}

if (argv.last && uniqueReleases.length >= argv.last) {
return accReleases.concat(uniqueReleases).slice(0, argv.last);
}

if (oldestReleaseOnPage) {
return accReleases.concat(uniqueReleases);
}

return accReleases.concat(uniqueReleases).slice(0, argv.nullDateLimit);
}

async function scrapeUpcomingReleases(scraper, site, beforeFetchLatest) {
if (argv.upcoming && scraper.fetchUpcoming) {
const upcomingReleases = await scraper.fetchUpcoming(site, 1, beforeFetchLatest, include);

return upcomingReleases
? upcomingReleases.map(release => ({ ...release, site, upcoming: true }))
: [];
}

return [];
}

async function scrapeSiteReleases(scraper, site, accSiteReleases) {
const beforeFetchLatest = await scraper.beforeFetchLatest?.(site, accSiteReleases);

const [newReleases, upcomingReleases] = await Promise.all([
scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases), // fetch basic release info from scene overview
scrapeUpcomingReleases(scraper, site, beforeFetchLatest, accSiteReleases), // fetch basic release info from upcoming overview
]);

if (argv.upcoming) {
logger.info(`${site.name}: ${argv.latest ? `Found ${newReleases.length}` : 'Ignoring'} latest releases,${argv.upcoming ? ' ' : ' ignoring '}${upcomingReleases.length || '0'} upcoming releases`);
}

const baseReleases = [...newReleases, ...upcomingReleases];

if (argv.deep) {
// follow URL for every release
return deepFetchReleases(baseReleases, beforeFetchLatest);
}

return baseReleases;
}

async function scrapeSite(site, network, accSiteReleases = []) {
if (site.parameters?.ignore) {
logger.warn(`Ignoring ${network.name}: ${site.name}`);
return [];
}

const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];

if (!scraper) {
logger.warn(`No scraper found for '${site.name}' (${site.slug})`);
return [];
}

try {
const siteReleases = await scrapeSiteReleases(scraper, site, accSiteReleases);

return siteReleases.map(release => ({ ...release, site }));
} catch (error) {
logger.error(`${site.name}: Failed to scrape releases: ${error.message}`);

return [];
}
}

async function scrapeSites() {
const networks = await fetchIncludedSites();

const scrapedNetworks = await Promise.map(networks, async (network) => {
if (network.parameters?.sequential) {
logger.info(`Scraping '${network.name}' sequentially`);

return Promise.reduce(network.sites, async (acc, site) => {
const accSiteReleases = await acc;
const siteReleases = await scrapeSite(site, network, accSiteReleases);

return accSiteReleases.concat(siteReleases);
}, Promise.resolve([]));
}

return Promise.map(network.sites, async site => scrapeSite(site, network), {
concurrency: network.parameters?.concurrency || 2,
});
},
{
// 5 networks at a time
concurrency: 5,
});

const releases = scrapedNetworks.flat(2);

if (argv.inspect) {
console.log(releases);
}

if (argv.save) {
await storeReleases(releases);
}
}

module.exports = scrapeSites;
@@ -7,7 +7,7 @@ const { fetchScene, fetchLatest, fetchUpcoming, fetchProfile } = require('./gamm
async function fetchSceneWrapper(url, site, baseRelease) {
const release = await fetchScene(url, site, baseRelease);

if (site.isFallback && release.channel) {
if (site.isNetwork && release.channel) {
const channelUrl = url.replace('blowpass.com', `${release.channel}.com`);

if (['onlyteenblowjobs', 'mommyblowsbest'].includes(release.channel)) {
@@ -392,9 +392,8 @@ async function fetchProfile(actorName, scraperSlug, site) {
: await get(`${site.url}/${t1}models/${actorSlugA}.html`);

const res = (res1.ok && res1)
|| (site.parameters?.profile
? await get(util.format(site.parameters.profile, actorSlugB))
: await get(`${site.url}/${t1}models/${actorSlugB}.html`));
|| (site.parameters?.profile && await get(util.format(site.parameters.profile, actorSlugB)))
|| await get(`${site.url}/${t1}models/${actorSlugB}.html`);

if (!res.ok) return res.status;
if (site.parameters?.t1) return scrapeProfileT1(res.item, site);
@@ -118,7 +118,7 @@ async function fetchScene(url, site) {
const res = await bhttp.get(url);

if (res.statusCode === 200) {
if (site.isFallback) {
if (site.isNetwork) {
const entryId = scrapeFallbackLanding(res.body.toString(), url);

const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', {
@@ -19,7 +19,8 @@ function curateReleaseEntry(release, batchId, existingRelease) {
const curatedRelease = {
title: release.title,
entry_id: release.entryId || null,
site_id: release.site.id,
site_id: release.site?.id,
network_id: release.site ? null : release.network?.id, // prefer site ID if available
shoot_id: release.shootId || null,
studio_id: release.studio?.id || null,
url: release.url,
@@ -45,7 +46,7 @@
}

async function attachChannelSites(releases) {
const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isFallback));
const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isNetwork));

const channelSites = await knex('sites')
.leftJoin('networks', 'networks.id', 'sites.network_id')
@@ -56,7 +57,7 @@ async function attachChannelSites(releases) {

const releasesWithChannelSite = await Promise.all(releases
.map(async (release) => {
if (release.site && !release.site.isFallback) {
if (release.site && !release.site.isNetwork) {
return release;
}
@@ -69,6 +70,14 @@
};
}

if (release.site && release.site.isNetwork) {
return {
...release,
site: null,
network: release.site,
};
}

logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);

return null;
@@ -197,6 +206,10 @@ async function updateReleasesSearch(releaseIds) {
}

async function storeReleases(releases) {
if (releases.length === 0) {
return [];
}

const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

const releasesWithSites = await attachChannelSites(releases);
@@ -1,29 +0,0 @@
'use strict';

const path = require('path');
const Promise = require('bluebird');
const fs = require('fs-extra');
const fetchScene = require('../scrape-releases');

const argv = require('../argv');

async function renameFiles() {
const filenames = await fs.readdir(process.cwd());

const curated = await Promise.map(filenames, async (filename) => {
const shootId = filename.split(' ')[1];
const scene = await fetchScene(`https://kink.com/shoot/${shootId}`);

if (argv.confirm) {
await fs.rename(path.join(process.cwd(), filename), path.join(process.cwd(), `${scene.filename}.mp4`));
}

return scene.filename;
}, {
concurrency: 5,
});

console.log(curated);
}

renameFiles();