forked from DebaucheryLibrarian/traxxx
Allowing scrapers to return raw tags and site URLs or slugs, to gradually remove site and tag fetching from individual scrapers. Added media and deep fetching support to Perv City scraper.
This commit is contained in:
parent
7840af2843
commit
71cb85c3e1
|
@ -1,25 +1,5 @@
|
|||
<template>
|
||||
<div class="tags">
|
||||
<h3>Penetration</h3>
|
||||
|
||||
<div class="tiles">
|
||||
<Tag
|
||||
v-for="tag in tags.penetration"
|
||||
:key="`tag-${tag.id}`"
|
||||
:tag="tag"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<h3>Group</h3>
|
||||
|
||||
<div class="tiles">
|
||||
<Tag
|
||||
v-for="tag in tags.group"
|
||||
:key="`tag-${tag.id}`"
|
||||
:tag="tag"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<h3>Ethnicity</h3>
|
||||
|
||||
<div class="tiles">
|
||||
|
@ -30,6 +10,16 @@
|
|||
/>
|
||||
</div>
|
||||
|
||||
<h3>Penetration</h3>
|
||||
|
||||
<div class="tiles">
|
||||
<Tag
|
||||
v-for="tag in tags.penetration"
|
||||
:key="`tag-${tag.id}`"
|
||||
:tag="tag"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<h3>Finish</h3>
|
||||
|
||||
<div class="tiles">
|
||||
|
@ -39,6 +29,16 @@
|
|||
:tag="tag"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<h3>Group</h3>
|
||||
|
||||
<div class="tiles">
|
||||
<Tag
|
||||
v-for="tag in tags.group"
|
||||
:key="`tag-${tag.id}`"
|
||||
:tag="tag"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
class="date"
|
||||
>{{ `(${formatDate(release.dateAdded, 'MMM D, YYYY')})` }}`</a>
|
||||
>{{ `(${formatDate(release.dateAdded, 'MMM D, YYYY')})` }}</a>
|
||||
</span>
|
||||
|
||||
<router-link
|
||||
|
|
|
@ -309,7 +309,7 @@ function getTags(groupsMap) {
|
|||
{
|
||||
name: 'triple anal',
|
||||
slug: 'triple-anal',
|
||||
description: 'Getting fucked in the ass by not one, two but *three* cocks at the same time.',
|
||||
description: 'Getting fucked in the ass by not one, two, but *three* cocks at the same time.',
|
||||
priority: 7,
|
||||
alias_for: null,
|
||||
group_id: groupsMap.penetration,
|
||||
|
|
|
@ -88,7 +88,7 @@ function getMedia(tagsMap) {
|
|||
path: 'tags/triple-anal/2.jpeg',
|
||||
target_id: tagsMap['triple-anal'],
|
||||
role: 'photo',
|
||||
comment: 'Kira Thorn in GIO1018 for LegalPorno"',
|
||||
comment: 'Kira Thorn in GIO1018 for LegalPorno',
|
||||
},
|
||||
{
|
||||
path: 'tags/blowbang/poster.jpeg',
|
||||
|
|
|
@ -12,6 +12,7 @@ const {
|
|||
storePhotos,
|
||||
storeTrailer,
|
||||
} = require('./media');
|
||||
const { fetchSites, findSiteByUrl } = require('./sites');
|
||||
|
||||
async function curateRelease(release) {
|
||||
const [actors, tags, media] = await Promise.all([
|
||||
|
@ -91,8 +92,23 @@ function curateReleases(releases) {
|
|||
return Promise.all(releases.map(async release => curateRelease(release)));
|
||||
}
|
||||
|
||||
function curateScrapedRelease(release) {
|
||||
return {
|
||||
/**
 * Resolve the site a scraped release belongs to from its raw channel value.
 *
 * `release.channel` may be a URL or a name/slug, depending on the scraper.
 * First attempt a URL lookup; if that throws (presumably when the value is
 * not a URL — TODO confirm findSiteByUrl's failure mode), fall back to
 * matching the channel string against site names and slugs.
 *
 * @param {Object} release - Scraped release carrying a `channel` property.
 * @returns {Promise<Object|null>} Matched site entity, or null when no match.
 */
async function getChannelSite(release) {
  try {
    const site = await findSiteByUrl(release.channel);

    return site || null;
  } catch (error) {
    // URL lookup failed; treat the channel as a name or slug instead.
    // NOTE(review): the original error is intentionally discarded here.
    const [site] = await fetchSites({
      name: release.channel,
      slug: release.channel,
    });

    return site || null;
  }
}
|
||||
|
||||
async function curateScrapedRelease(release) {
|
||||
const curatedRelease = {
|
||||
site_id: release.site.id,
|
||||
studio_id: release.studio ? release.studio.id : null,
|
||||
shoot_id: release.shootId || null,
|
||||
|
@ -108,6 +124,17 @@ function curateScrapedRelease(release) {
|
|||
rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
|
||||
deep: Boolean(argv.deep && release.url && !release.upcoming),
|
||||
};
|
||||
|
||||
if (release.site.isFallback && release.channel) {
|
||||
const site = await getChannelSite(release);
|
||||
|
||||
if (site) {
|
||||
curatedRelease.site_id = site.id;
|
||||
return curatedRelease;
|
||||
}
|
||||
}
|
||||
|
||||
return curatedRelease;
|
||||
}
|
||||
|
||||
function commonQuery(queryBuilder, {
|
||||
|
@ -138,7 +165,9 @@ function commonQuery(queryBuilder, {
|
|||
.andWhereRaw('tags_associated.release_id = releases.id');
|
||||
})
|
||||
.andWhere('date', '>', after)
|
||||
.orWhere('releases.created_at', '>', after)
|
||||
.andWhere('date', '<=', before)
|
||||
.orWhere('releases.created_at', '<=', before)
|
||||
.orderBy([{ column: 'date', order: 'desc' }, { column: 'created_at', order: 'desc' }])
|
||||
.limit(limit);
|
||||
}
|
||||
|
@ -206,7 +235,7 @@ async function storeReleaseAssets(release, releaseId) {
|
|||
|
||||
async function storeRelease(release) {
|
||||
const existingRelease = await knex('releases').where('entry_id', release.entryId).first();
|
||||
const curatedRelease = curateScrapedRelease(release);
|
||||
const curatedRelease = await curateScrapedRelease(release);
|
||||
|
||||
if (existingRelease && !argv.redownload) {
|
||||
return existingRelease.id;
|
||||
|
@ -256,6 +285,8 @@ async function storeReleases(releases) {
|
|||
});
|
||||
|
||||
const actors = storedReleases.reduce((acc, release) => {
|
||||
if (!release.actors) return acc;
|
||||
|
||||
release.actors.forEach((actor) => {
|
||||
const trimmedActor = actor.trim();
|
||||
|
||||
|
@ -274,6 +305,8 @@ async function storeReleases(releases) {
|
|||
associateActors(actors, storedReleases),
|
||||
Promise.all(storedReleases.map(async release => storeReleaseAssets(release, release.id))),
|
||||
]);
|
||||
|
||||
return storedReleases;
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
|
|
|
@ -48,9 +48,9 @@ async function scrapeRelease(url, release, deep = false) {
|
|||
|
||||
if (!deep && argv.save) {
|
||||
// don't store release when called by site scraper
|
||||
const [releaseId] = await storeReleases([scene]);
|
||||
const [storedRelease] = await storeReleases([scene]);
|
||||
|
||||
console.log(`http://${config.web.host}:${config.web.port}/scene/${releaseId}`);
|
||||
console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`);
|
||||
}
|
||||
|
||||
return scene;
|
||||
|
|
|
@ -2,9 +2,25 @@
|
|||
|
||||
const bhttp = require('bhttp');
|
||||
const cheerio = require('cheerio');
|
||||
const { JSDOM } = require('jsdom');
|
||||
const moment = require('moment');
|
||||
|
||||
function scrape(html, site) {
|
||||
/**
 * Fetch trailer metadata for a scene from the Perv City token endpoint.
 *
 * @param {string} entryId - The scene's set ID.
 * @returns {Promise<Object|null>} `{ poster, trailer }` on success, or null
 *   when the endpoint does not answer with HTTP 200.
 */
async function getTrailer(entryId) {
  const res = await bhttp.post('https://www.pervcity.com/gettoken.php', { setId: entryId });

  if (res.statusCode !== 200) {
    return null;
  }

  const { TrailerImg, TrailerPath, Trailerfallback } = res.body;

  return {
    poster: TrailerImg,
    trailer: TrailerPath || Trailerfallback,
  };
}
|
||||
|
||||
function scrapeLatestScene(html, site) {
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
|
||||
const entryId = $('li').attr('id');
|
||||
|
@ -15,6 +31,9 @@ function scrape(html, site) {
|
|||
const actors = $('.home_model_name a').toArray().map(element => $(element).text().replace(/,[\u0020\u00A0\u202F]/, '')); // replace weird commas
|
||||
const date = moment.utc($('.add_date').text(), 'DD-MM-YYYY').toDate();
|
||||
|
||||
const poster = $('a:nth-child(2) > img').attr('src');
|
||||
const photos = $('.sample-picker img').map((index, element) => $(element).attr('src').replace('tourpics', 'trailer')).toArray();
|
||||
|
||||
const stars = $('img[src*="/star.png"]')
|
||||
.toArray()
|
||||
.map(element => $(element).attr('src'))
|
||||
|
@ -26,6 +45,8 @@ function scrape(html, site) {
|
|||
title,
|
||||
actors,
|
||||
date,
|
||||
poster,
|
||||
photos,
|
||||
rating: {
|
||||
stars,
|
||||
},
|
||||
|
@ -33,17 +54,87 @@ function scrape(html, site) {
|
|||
};
|
||||
}
|
||||
|
||||
/**
 * Scrape a release from a full Perv City scene page.
 *
 * @param {string} html - Raw scene page markup.
 * @param {string} url - Canonical scene URL, stored on the release.
 * @param {Object} site - Site entity the scene belongs to.
 * @returns {Promise<Object>} Curated release object.
 */
async function scrapeScene(html, url, site) {
  const { document } = new JSDOM(html).window;

  const release = { url, site };

  release.entryId = document.querySelector('input#set_ID').value;

  release.title = document.querySelector('title').textContent;
  release.description = document.querySelector('.player_data').textContent.trim();

  const durationString = document.querySelector('.tag_lineR div:nth-child(2) span').textContent;
  // match() returns null when the text contains no digits; don't crash on it.
  const durationParts = durationString.match(/\d+/g);

  if (durationParts) {
    const [minutes, seconds] = durationParts;
    release.duration = Number(minutes) * 60 + Number(seconds);
  }

  release.tags = document.querySelector('meta[name="keywords"]').content.split(',');

  // getTrailer() resolves to null on a non-200 response; destructuring a null
  // would throw, so fall back to an empty object.
  const { poster, trailer } = await getTrailer(release.entryId) || {};

  release.poster = poster;

  if (trailer) {
    release.trailer = { src: trailer };
  }

  return release;
}
|
||||
|
||||
/**
 * Extract the set ID from a fallback landing page; it is the only value
 * needed to drive the popup video endpoint.
 *
 * @param {string} html - Raw landing page markup.
 * @returns {string} The scene's set ID.
 */
function scrapeFallbackLanding(html) {
  const { window } = new JSDOM(html);

  return window.document.querySelector('input#set_ID').value;
}
|
||||
|
||||
/**
 * Scrape a release from the popup-video markup used by fallback sites.
 *
 * @param {string} html - Raw popup markup from set_popupvideo.php.
 * @param {string} entryId - The scene's set ID, scraped from the landing page.
 * @param {string} url - Canonical scene URL, stored on the release.
 * @param {Object} site - (Fallback) site entity the scene was requested for.
 * @returns {Promise<Object>} Curated release object, including the raw
 *   channel name for later site resolution.
 */
async function scrapeFallbackScene(html, entryId, url, site) {
  const { document } = new JSDOM(html).window;
  const release = { url, entryId, site };

  release.title = document.querySelector('.popup_data_set_head label').textContent;
  release.description = document.querySelector('.popup_data_set_des p').textContent.trim();
  release.date = moment.utc(document.querySelector('.popup_left_top div span').textContent, 'MM-DD-YYYY').toDate();
  release.actors = Array.from(document.querySelectorAll('.popup_data_set_models a'), el => el.textContent);

  // getTrailer() resolves to null on a non-200 response; destructuring a null
  // would throw, so fall back to an empty object.
  const { poster, trailer } = await getTrailer(release.entryId) || {};

  release.poster = poster;

  if (trailer) {
    release.trailer = { src: trailer };
  }

  // Raw channel name; resolved to an actual site by the release curator.
  release.channel = document.querySelector('.popup_left_top div img').alt;

  return release;
}
|
||||
|
||||
/**
 * Fetch the latest scenes for a site. Page 1 uses the initial-view endpoint;
 * subsequent pages use the grid-view endpoint with a growing window.
 *
 * @param {Object} site - Site entity with `url` and `parameters.tourId`.
 * @param {number} [page=1] - 1-based page number.
 * @returns {Promise<Array>} Scraped latest releases.
 */
async function fetchLatest(site, page = 1) {
  const endpoint = page === 1
    ? `${site.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`
    : `${site.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 1}&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`;

  const res = await bhttp.get(endpoint);
  const elements = JSON.parse(res.body.toString());

  // total_arr is a key-value object for final_load_latestupdate_grid_view.php
  return Object.values(elements.total_arr).map(html => scrapeLatestScene(html, site));
}
|
||||
|
||||
/**
 * Fetch and scrape a single scene by URL. Fallback sites require a second
 * request to the popup-video endpoint, keyed by the landing page's set ID.
 *
 * @param {string} url - Scene URL to fetch.
 * @param {Object} site - Site entity; `isFallback` selects the scrape path.
 * @returns {Promise<Object|null>} Scraped release, or null on a non-200 fetch.
 */
async function fetchScene(url, site) {
  const res = await bhttp.get(url);

  if (res.statusCode !== 200) {
    return null;
  }

  const body = res.body.toString();

  if (!site.isFallback) {
    return scrapeScene(body, url, site);
  }

  const entryId = scrapeFallbackLanding(body, url);
  const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', { setId: entryId });

  return scrapeFallbackScene(fallbackRes.body.toString(), entryId, url, site);
}
|
||||
|
||||
// Public scraper interface consumed by the generic scraping pipeline.
module.exports = {
  fetchLatest,
  fetchScene,
};
|
||||
|
|
52
src/tags.js
52
src/tags.js
|
@ -33,14 +33,41 @@ function curateTags(tags) {
|
|||
return Promise.all(tags.map(async tag => curateTag(tag)));
|
||||
}
|
||||
|
||||
/**
 * Resolve raw tag strings returned by a scraper to canonical tag IDs.
 *
 * Matching is case-tolerant (the original strings plus their lower- and
 * upper-cased variants), and aliases are collapsed onto the tag they alias.
 *
 * @param {string[]} rawTags - Tag names as scraped from a site.
 * @returns {Promise<number[]>} Distinct canonical tag IDs.
 */
async function matchTags(rawTags) {
  const caseVariants = [
    ...rawTags,
    ...rawTags.map(tag => tag.toLowerCase()),
    ...rawTags.map(tag => tag.toUpperCase()),
  ];

  return knex('tags')
    .pluck('aliases.id')
    .whereIn('tags.name', caseVariants)
    .where(function where() {
      this
        .whereNull('tags.alias_for')
        .orWhereNull('aliases.alias_for');
    })
    .join('tags as aliases', function join() {
      this
        .on('tags.alias_for', 'aliases.id')
        .orOn('tags.id', 'aliases.id');
    })
    .groupBy('aliases.id');
}
|
||||
|
||||
async function associateTags(release, releaseId) {
|
||||
if (!release.tags || release.tags.length === 0) {
|
||||
console.warn(`No tags available for (${release.site.name}, ${releaseId}}) "${release.title}"`);
|
||||
return;
|
||||
}
|
||||
|
||||
const tags = release.tags.some(tag => typeof tag === 'string')
|
||||
? await matchTags(release.tags) // scraper returned raw tags
|
||||
: release.tags; // tags already matched by scraper
|
||||
|
||||
try {
|
||||
await knex('tags_associated').insert(release.tags.map(tagId => ({
|
||||
await knex('tags_associated').insert(tags.map(tagId => ({
|
||||
tag_id: tagId,
|
||||
release_id: releaseId,
|
||||
})));
|
||||
|
@ -65,29 +92,6 @@ async function fetchTags(queryObject, groupsQueryObject, limit = 100) {
|
|||
return curateTags(tags);
|
||||
}
|
||||
|
||||
async function matchTags(rawTags) {
|
||||
const tags = rawTags
|
||||
.concat(rawTags.map(tag => tag.toLowerCase()))
|
||||
.concat(rawTags.map(tag => tag.toUpperCase()));
|
||||
|
||||
const tagEntries = await knex('tags')
|
||||
.pluck('aliases.id')
|
||||
.whereIn('tags.name', tags)
|
||||
.where(function where() {
|
||||
this
|
||||
.whereNull('tags.alias_for')
|
||||
.orWhereNull('aliases.alias_for');
|
||||
})
|
||||
.join('tags as aliases', function join() {
|
||||
this
|
||||
.on('tags.alias_for', 'aliases.id')
|
||||
.orOn('tags.id', 'aliases.id');
|
||||
})
|
||||
.groupBy('aliases.id');
|
||||
|
||||
return tagEntries;
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
associateTags,
|
||||
fetchTags,
|
||||
|
|
Loading…
Reference in New Issue