Allowing scrapers to return raw tags and site URLs or slugs, to gradually remove site and tag fetching from individual scrapers. Added media and deep fetching support to the Perv City scraper.

ThePendulum 2019-12-05 01:26:22 +01:00
parent 7840af2843
commit 71cb85c3e1
8 changed files with 182 additions and 54 deletions
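The contract change at the heart of this commit: a scraper may now return tags as raw strings, to be matched against the tags table by matchTags(), and a channel as a site name, slug, or URL, to be resolved by getChannelSite() when the scraped site is a network-level fallback. A minimal sketch of the relaxed release shape, with purely hypothetical values:

// Hypothetical scraper output; the entryId, title, tag names and channel
// value are illustrative only, not taken from any real site.
const release = {
  entryId: '12345',
  title: 'Example Scene',
  tags: ['Anal', 'Threesome'], // raw strings, resolved downstream by matchTags()
  channel: 'examplechannel', // name, slug or URL, resolved by getChannelSite()
};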

View File

@@ -1,25 +1,5 @@
 <template>
   <div class="tags">
-    <h3>Penetration</h3>
-    <div class="tiles">
-      <Tag
-        v-for="tag in tags.penetration"
-        :key="`tag-${tag.id}`"
-        :tag="tag"
-      />
-    </div>
-
-    <h3>Group</h3>
-    <div class="tiles">
-      <Tag
-        v-for="tag in tags.group"
-        :key="`tag-${tag.id}`"
-        :tag="tag"
-      />
-    </div>
-
     <h3>Ethnicity</h3>
     <div class="tiles">
@@ -30,6 +10,16 @@
       />
     </div>

+    <h3>Penetration</h3>
+    <div class="tiles">
+      <Tag
+        v-for="tag in tags.penetration"
+        :key="`tag-${tag.id}`"
+        :tag="tag"
+      />
+    </div>
+
     <h3>Finish</h3>
     <div class="tiles">
@@ -39,6 +29,16 @@
         :tag="tag"
       />
     </div>
+
+    <h3>Group</h3>
+    <div class="tiles">
+      <Tag
+        v-for="tag in tags.group"
+        :key="`tag-${tag.id}`"
+        :tag="tag"
+      />
+    </div>
   </div>
 </template>

View File

@@ -35,7 +35,7 @@
       target="_blank"
       rel="noopener noreferrer"
       class="date"
-    >{{ `(${formatDate(release.dateAdded, 'MMM D, YYYY')})` }}`</a>
+    >{{ `(${formatDate(release.dateAdded, 'MMM D, YYYY')})` }}</a>
     </span>
     <router-link

View File

@@ -309,7 +309,7 @@ function getTags(groupsMap) {
     {
       name: 'triple anal',
       slug: 'triple-anal',
-      description: 'Getting fucked in the ass by not one, two but *three* cocks at the same time.',
+      description: 'Getting fucked in the ass by not one, two, but *three* cocks at the same time.',
       priority: 7,
       alias_for: null,
       group_id: groupsMap.penetration,

View File

@@ -88,7 +88,7 @@ function getMedia(tagsMap) {
       path: 'tags/triple-anal/2.jpeg',
       target_id: tagsMap['triple-anal'],
       role: 'photo',
-      comment: 'Kira Thorn in GIO1018 for LegalPorno"',
+      comment: 'Kira Thorn in GIO1018 for LegalPorno',
     },
     {
       path: 'tags/blowbang/poster.jpeg',

View File

@@ -12,6 +12,7 @@ const {
   storePhotos,
   storeTrailer,
 } = require('./media');
+const { fetchSites, findSiteByUrl } = require('./sites');

 async function curateRelease(release) {
   const [actors, tags, media] = await Promise.all([
@@ -91,8 +92,23 @@ function curateReleases(releases) {
   return Promise.all(releases.map(async release => curateRelease(release)));
 }

-function curateScrapedRelease(release) {
-  return {
+async function getChannelSite(release) {
+  try {
+    const site = await findSiteByUrl(release.channel);
+
+    return site || null;
+  } catch (error) {
+    const [site] = await fetchSites({
+      name: release.channel,
+      slug: release.channel,
+    });
+
+    return site || null;
+  }
+}
+
+async function curateScrapedRelease(release) {
+  const curatedRelease = {
     site_id: release.site.id,
     studio_id: release.studio ? release.studio.id : null,
     shoot_id: release.shootId || null,
@@ -108,6 +124,17 @@ function curateScrapedRelease(release) {
     rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
     deep: Boolean(argv.deep && release.url && !release.upcoming),
   };
+
+  if (release.site.isFallback && release.channel) {
+    const site = await getChannelSite(release);
+
+    if (site) {
+      curatedRelease.site_id = site.id;
+      return curatedRelease;
+    }
+  }
+
+  return curatedRelease;
 }

 function commonQuery(queryBuilder, {
@@ -138,7 +165,9 @@
         .andWhereRaw('tags_associated.release_id = releases.id');
     })
     .andWhere('date', '>', after)
+    .orWhere('releases.created_at', '>', after)
     .andWhere('date', '<=', before)
+    .orWhere('releases.created_at', '<=', before)
     .orderBy([{ column: 'date', order: 'desc' }, { column: 'created_at', order: 'desc' }])
     .limit(limit);
 }
@@ -206,7 +235,7 @@ async function storeReleaseAssets(release, releaseId) {
 async function storeRelease(release) {
   const existingRelease = await knex('releases').where('entry_id', release.entryId).first();
-  const curatedRelease = curateScrapedRelease(release);
+  const curatedRelease = await curateScrapedRelease(release);

   if (existingRelease && !argv.redownload) {
     return existingRelease.id;
@@ -256,6 +285,8 @@ async function storeReleases(releases) {
   });

   const actors = storedReleases.reduce((acc, release) => {
+    if (!release.actors) return acc;
+
     release.actors.forEach((actor) => {
       const trimmedActor = actor.trim();
@@ -274,6 +305,8 @@ async function storeReleases(releases) {
     associateActors(actors, storedReleases),
     Promise.all(storedReleases.map(async release => storeReleaseAssets(release, release.id))),
   ]);
+
+  return storedReleases;
 }

 module.exports = {
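Putting the new pieces of releases.js together: for a release scraped from a network-level placeholder site, curation now tries to pin the release to a concrete channel site. A sketch of the flow, assuming hypothetical site and channel values:

// Hypothetical release from a network page; the site and channel are examples.
const release = {
  site: { id: 1, isFallback: true }, // placeholder site for the whole network
  channel: 'https://channel.example.com', // or a name/slug like 'examplechannel'
};

// getChannelSite() first tries findSiteByUrl(release.channel); if that throws,
// for instance because the channel is not a URL, it falls back to fetchSites()
// with the channel as both name and slug.
const curated = await curateScrapedRelease(release);
// curated.site_id is the matched channel site's ID, or the fallback site's ID
// when no channel site could be found.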

View File

@@ -48,9 +48,9 @@ async function scrapeRelease(url, release, deep = false) {

   if (!deep && argv.save) {
     // don't store release when called by site scraper
-    const [releaseId] = await storeReleases([scene]);
-    console.log(`http://${config.web.host}:${config.web.port}/scene/${releaseId}`);
+    const [storedRelease] = await storeReleases([scene]);
+    console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`);
   }

   return scene;

View File

@@ -2,9 +2,25 @@
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
+const { JSDOM } = require('jsdom');
 const moment = require('moment');

-function scrape(html, site) {
+async function getTrailer(entryId) {
+  const trailerRes = await bhttp.post('https://www.pervcity.com/gettoken.php', {
+    setId: entryId,
+  });
+
+  if (trailerRes.statusCode === 200) {
+    return {
+      poster: trailerRes.body.TrailerImg,
+      trailer: trailerRes.body.TrailerPath || trailerRes.body.Trailerfallback,
+    };
+  }
+
+  return null;
+}
+
+function scrapeLatestScene(html, site) {
   const $ = cheerio.load(html, { normalizeWhitespace: true });

   const entryId = $('li').attr('id');
@@ -15,6 +31,9 @@ function scrape(html, site) {
   const actors = $('.home_model_name a').toArray().map(element => $(element).text().replace(/,[\u0020\u00A0\u202F]/, '')); // replace weird commas
   const date = moment.utc($('.add_date').text(), 'DD-MM-YYYY').toDate();

+  const poster = $('a:nth-child(2) > img').attr('src');
+  const photos = $('.sample-picker img').map((index, element) => $(element).attr('src').replace('tourpics', 'trailer')).toArray();
+
   const stars = $('img[src*="/star.png"]')
     .toArray()
     .map(element => $(element).attr('src'))
@@ -26,6 +45,8 @@
     title,
     actors,
     date,
+    poster,
+    photos,
     rating: {
       stars,
     },
@@ -33,17 +54,87 @@
   };
 }

+async function scrapeScene(html, url, site) {
+  const { document } = new JSDOM(html).window;
+  const release = { url, site };
+
+  release.entryId = document.querySelector('input#set_ID').value;
+  release.title = document.querySelector('title').textContent;
+  release.description = document.querySelector('.player_data').textContent.trim();
+
+  const durationString = document.querySelector('.tag_lineR div:nth-child(2) span').textContent;
+  const [minutes, seconds] = durationString.match(/\d+/g);
+  release.duration = Number(minutes) * 60 + Number(seconds);
+
+  release.tags = document.querySelector('meta[name="keywords"]').content.split(',');
+
+  const { poster, trailer } = await getTrailer(release.entryId);
+  release.poster = poster;
+  release.trailer = { src: trailer };
+
+  return release;
+}
+
+function scrapeFallbackLanding(html) {
+  const { document } = new JSDOM(html).window;
+
+  return document.querySelector('input#set_ID').value;
+}
+
+async function scrapeFallbackScene(html, entryId, url, site) {
+  const { document } = new JSDOM(html).window;
+  const release = { url, entryId, site };
+
+  release.title = document.querySelector('.popup_data_set_head label').textContent;
+  release.description = document.querySelector('.popup_data_set_des p').textContent.trim();
+  release.date = moment.utc(document.querySelector('.popup_left_top div span').textContent, 'MM-DD-YYYY').toDate();
+  release.actors = Array.from(document.querySelectorAll('.popup_data_set_models a'), el => el.textContent);
+
+  const { poster, trailer } = await getTrailer(release.entryId);
+  release.poster = poster;
+  release.trailer = { src: trailer };
+
+  release.channel = document.querySelector('.popup_left_top div img').alt;
+
+  return release;
+}
+
 async function fetchLatest(site, page = 1) {
   const res = page === 1
     ? await bhttp.get(`${site.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`)
     : await bhttp.get(`${site.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 1}&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`);

   const elements = JSON.parse(res.body.toString());
-  const latest = Object.values(elements.total_arr).map(html => scrape(html, site)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php
+  const latest = Object.values(elements.total_arr).map(html => scrapeLatestScene(html, site)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php

   return latest;
 }

+async function fetchScene(url, site) {
+  const res = await bhttp.get(url);
+
+  if (res.statusCode === 200) {
+    if (site.isFallback) {
+      const entryId = scrapeFallbackLanding(res.body.toString(), url);
+      const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', {
+        setId: entryId,
+      });
+
+      return scrapeFallbackScene(fallbackRes.body.toString(), entryId, url, site);
+    }
+
+    return scrapeScene(res.body.toString(), url, site);
+  }
+
+  return null;
+}
+
 module.exports = {
   fetchLatest,
+  fetchScene,
 };
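With fetchScene exported, the Perv City scraper now supports deep fetching: given a scene URL, it returns a fully populated release, including the poster and trailer fetched from gettoken.php. A hypothetical invocation, with an illustrative URL and site object:

// Hypothetical call; the scene URL and site object are examples only.
const release = await fetchScene('https://www.pervcity.com/scene.html', {
  url: 'https://www.pervcity.com',
  isFallback: true, // network site: takes the set_popupvideo.php fallback path
});
// On the fallback path, release.channel carries the channel name scraped from
// the popup, which curateScrapedRelease() uses to resolve the concrete site.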

View File

@@ -33,14 +33,41 @@ function curateTags(tags) {
   return Promise.all(tags.map(async tag => curateTag(tag)));
 }

+async function matchTags(rawTags) {
+  const tags = rawTags
+    .concat(rawTags.map(tag => tag.toLowerCase()))
+    .concat(rawTags.map(tag => tag.toUpperCase()));
+
+  const tagEntries = await knex('tags')
+    .pluck('aliases.id')
+    .whereIn('tags.name', tags)
+    .where(function where() {
+      this
+        .whereNull('tags.alias_for')
+        .orWhereNull('aliases.alias_for');
+    })
+    .join('tags as aliases', function join() {
+      this
+        .on('tags.alias_for', 'aliases.id')
+        .orOn('tags.id', 'aliases.id');
+    })
+    .groupBy('aliases.id');
+
+  return tagEntries;
+}
+
 async function associateTags(release, releaseId) {
   if (!release.tags || release.tags.length === 0) {
     console.warn(`No tags available for (${release.site.name}, ${releaseId}}) "${release.title}"`);
     return;
   }

+  const tags = release.tags.some(tag => typeof tag === 'string')
+    ? await matchTags(release.tags) // scraper returned raw tags
+    : release.tags; // tags already matched by scraper
+
   try {
-    await knex('tags_associated').insert(release.tags.map(tagId => ({
+    await knex('tags_associated').insert(tags.map(tagId => ({
       tag_id: tagId,
       release_id: releaseId,
     })));
@@ -65,29 +92,6 @@ async function fetchTags(queryObject, groupsQueryObject, limit = 100) {
   return curateTags(tags);
 }

-async function matchTags(rawTags) {
-  const tags = rawTags
-    .concat(rawTags.map(tag => tag.toLowerCase()))
-    .concat(rawTags.map(tag => tag.toUpperCase()));
-
-  const tagEntries = await knex('tags')
-    .pluck('aliases.id')
-    .whereIn('tags.name', tags)
-    .where(function where() {
-      this
-        .whereNull('tags.alias_for')
-        .orWhereNull('aliases.alias_for');
-    })
-    .join('tags as aliases', function join() {
-      this
-        .on('tags.alias_for', 'aliases.id')
-        .orOn('tags.id', 'aliases.id');
-    })
-    .groupBy('aliases.id');
-
-  return tagEntries;
-}
-
 module.exports = {
   associateTags,
   fetchTags,
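associateTags() now accepts either pre-matched tag IDs or the raw strings a scraper returns. matchTags() tolerates simple case differences by also querying lowercased and uppercased variants of each name, and resolves aliases to their canonical tag IDs through the self-join on alias_for. A hypothetical usage:

// Hypothetical input; the tag names are examples. The result contains
// canonical tag IDs, with aliases mapped to the tags they alias.
const tagIds = await matchTags(['Anal', 'THREESOME', 'facial']);
// tagIds can be inserted into tags_associated directly, as associateTags() does.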