Improved actor extraction for fcuk scraper. Changed 'copyright' to 'credit'. Reduced entity page favicon size.

This commit is contained in:
DebaucheryLibrarian
2020-07-15 05:12:29 +02:00
parent c62df2228b
commit 5b886b3917
8 changed files with 18 additions and 10 deletions

View File

@@ -2,6 +2,8 @@
const qu = require('../utils/qu');
// TODO: profile scraping
function scrapeLatestBlog(scenes, channel) {
return scenes.map(({ query }) => {
const release = {};
@@ -18,7 +20,10 @@ function scrapeLatestBlog(scenes, channel) {
release.description = query.text('p');
release.date = query.date('h5 strong, .videos h3', 'MMM. DD, YYYY', /\w+. \d{2}, \d{4}/);
if (!/\band\b/.test(release.title) && new RegExp(release.title).test(release.description)) {
// remove common patterns so only the name is left
const curatedTitle = release.title.replace(/\b(part \d|\banal|bts)\b/gi, '').trim();
if (!/\band\b/.test(curatedTitle) && new RegExp(curatedTitle).test(release.description)) {
// scene title is probably the actor name
release.actors = [release.title];
}
@@ -63,7 +68,7 @@ function scrapeSceneBlog({ query }, url, channel) {
release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)\/(\d+)/).slice(1, 3).join('-');
release.title = query.q('h4 strong, .videos h3', true);
release.title = query.text('h4 strong, .videos h3');
release.description = query.q('#about p, .videos p', true);
const actors = query.urls('a[href*="/girl/"]').map(actorUrl => actorUrl.match(/video-([\w\s]+)/)?.[1]).filter(Boolean);
@@ -98,7 +103,6 @@ function scrapeScene({ query, html }, url, channel) {
};
}
console.log(release);
return release;
}