Improved actor extraction for fcuk scraper. Changed 'copyright' to 'credit'. Reduced entity page favicon size.
This commit is contained in:
@@ -2,6 +2,8 @@
|
||||
|
||||
const qu = require('../utils/qu');
|
||||
|
||||
// TODO: profile scraping
|
||||
|
||||
function scrapeLatestBlog(scenes, channel) {
|
||||
return scenes.map(({ query }) => {
|
||||
const release = {};
|
||||
@@ -18,7 +20,10 @@ function scrapeLatestBlog(scenes, channel) {
|
||||
release.description = query.text('p');
|
||||
release.date = query.date('h5 strong, .videos h3', 'MMM. DD, YYYY', /\w+. \d{2}, \d{4}/);
|
||||
|
||||
if (!/\band\b/.test(release.title) && new RegExp(release.title).test(release.description)) {
|
||||
// remove common patterns so only the name is left
|
||||
const curatedTitle = release.title.replace(/\b(part \d|\banal|bts)\b/gi, '').trim();
|
||||
|
||||
if (!/\band\b/.test(curatedTitle) && new RegExp(curatedTitle).test(release.description)) {
|
||||
// scene title is probably the actor name
|
||||
release.actors = [release.title];
|
||||
}
|
||||
@@ -63,7 +68,7 @@ function scrapeSceneBlog({ query }, url, channel) {
|
||||
|
||||
release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)\/(\d+)/).slice(1, 3).join('-');
|
||||
|
||||
release.title = query.q('h4 strong, .videos h3', true);
|
||||
release.title = query.text('h4 strong, .videos h3');
|
||||
release.description = query.q('#about p, .videos p', true);
|
||||
|
||||
const actors = query.urls('a[href*="/girl/"]').map(actorUrl => actorUrl.match(/video-([\w\s]+)/)?.[1]).filter(Boolean);
|
||||
@@ -98,7 +103,6 @@ function scrapeScene({ query, html }, url, channel) {
|
||||
};
|
||||
}
|
||||
|
||||
console.log(release);
|
||||
return release;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user