Made unprint the default and marked to-be-updated scrapers as deprecated. Merged movie tags and movie scene tags for the manticore movies table. Removed the step that set poster to null in deep merge; annotate it if it turns out to have a purpose. Refactored the Brad Montana scraper.
This commit is contained in:
@@ -205,12 +205,6 @@ module.exports = {
|
||||
fetchLatest,
|
||||
// fetchMovies,
|
||||
fetchProfile,
|
||||
scrapeScene: {
|
||||
scraper: scrapeRelease,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeMovie: {
|
||||
scraper: scrapeRelease,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene: scrapeRelease,
|
||||
scrapeMovie: scrapeRelease,
|
||||
};
|
||||
|
||||
@@ -74,8 +74,5 @@ async function fetchLatest(channel, page = 1) {
|
||||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
};
|
||||
|
||||
@@ -138,9 +138,6 @@ async function fetchProfile(actor, { channel }) {
|
||||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
fetchProfile,
|
||||
};
|
||||
|
||||
@@ -208,12 +208,6 @@ async function fetchProfile({ name: actorName, url: actorUrl }, { entity, includ
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeMovie: {
|
||||
scraper: scrapeMovie,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
scrapeMovie,
|
||||
};
|
||||
|
||||
@@ -225,5 +225,4 @@ module.exports = {
|
||||
fetchUpcoming,
|
||||
fetchProfile,
|
||||
scrapeScene,
|
||||
useUnprint: true,
|
||||
};
|
||||
|
||||
@@ -234,7 +234,6 @@ module.exports = {
|
||||
fetchProfile,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
parser: {
|
||||
runScripts: 'dangerously',
|
||||
},
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
'use strict';
|
||||
|
||||
const qu = require('../utils/q');
|
||||
const unprint = require('unprint');
|
||||
|
||||
const slugify = require('../utils/slugify');
|
||||
|
||||
function genderFromUrl(url) {
|
||||
@@ -20,18 +21,21 @@ function genderFromUrl(url) {
|
||||
function scrapeAll(scenes) {
|
||||
return scenes.map(({ query }) => {
|
||||
const release = {};
|
||||
const subtitle = query.cnt('.subtitle');
|
||||
|
||||
release.url = query.url('a');
|
||||
release.url = query.url(null);
|
||||
release.entryId = new URL(release.url).pathname.match(/\/videos\/([\w-]+)/)[1];
|
||||
|
||||
release.title = query.cnt('.title') || query.q('img', 'title');
|
||||
release.actors = subtitle.slice(subtitle.indexOf(':') + 1).split(',').map((actor) => actor.trim()).filter(Boolean);
|
||||
release.title = query.attribute('img', 'title') || query.content('.font-semibold');
|
||||
|
||||
release.poster = query.img('.thumb img');
|
||||
const poster = query.img('img[src*="/uploads"]');
|
||||
|
||||
if (release.poster) {
|
||||
const match = release.poster.match(/\/uploads\/(\d{4})\/(\d{2})/);
|
||||
if (poster) {
|
||||
release.poster = [
|
||||
poster.replace(/-\d+x\d+/, ''),
|
||||
poster,
|
||||
];
|
||||
|
||||
const match = poster.match(/\/uploads\/(\d{4})\/(\d{2})/);
|
||||
|
||||
if (match) {
|
||||
release.date = new Date(match[1], match[2] - 1, 1);
|
||||
@@ -43,30 +47,34 @@ function scrapeAll(scenes) {
|
||||
});
|
||||
}
|
||||
|
||||
function scrapeScene({ query, html }, url, channel) {
|
||||
function scrapeScene({ query, html }, { url, entity }) {
|
||||
const release = {};
|
||||
|
||||
const dataString = query.html('.yoast-schema-graph');
|
||||
const data = dataString && JSON.parse(dataString)['@graph'];
|
||||
const pageData = data.find((item) => item['@type'] === 'WebPage');
|
||||
const imageData = data.find((item) => item['@type'] === 'ImageObject');
|
||||
const data = query.json('.yoast-schema-graph')?.['@graph'];
|
||||
|
||||
const pageData = data?.find((item) => item['@type'] === 'WebPage');
|
||||
const imageData = data?.find((item) => item['@type'] === 'ImageObject');
|
||||
|
||||
release.entryId = new URL(url).pathname.match(/\/videos\/([\w-]+)/)[1];
|
||||
|
||||
release.title = query.cnt('.video .title h1')
|
||||
|| data.find((item) => item['@type'] === 'BreadcrumbList')?.itemListElement.slice(-1)[0].item.name
|
||||
|| pageData?.name.slice(0, pageData.name.lastIndexOf('-')).trim();
|
||||
release.title = query.content('.w-screen + div .font-semibold')
|
||||
|| data?.find((item) => item['@type'] === 'BreadcrumbList')?.itemListElement.slice(-1)[0].item?.name
|
||||
|| pageData?.name.slice(0, pageData?.name.lastIndexOf('-')).trim();
|
||||
|
||||
release.description = query.cnt('.video .descript');
|
||||
release.description = query.content('.leading-relaxed');
|
||||
release.date = pageData?.datePublished && new Date(pageData.datePublished);
|
||||
|
||||
release.date = pageData.datePublished && new Date(pageData.datePublished);
|
||||
|
||||
release.actors = query.all('.video .elenco a').map((el) => {
|
||||
const actorUrl = query.url(el, null);
|
||||
release.actors = query.elements('.models-slider-single a').map((el) => {
|
||||
const actorUrl = unprint.query.url(el, null);
|
||||
const avatarUrl = unprint.query.img(el);
|
||||
|
||||
return {
|
||||
name: query.cnt(el),
|
||||
name: unprint.query.content(el),
|
||||
url: actorUrl,
|
||||
avatar: [
|
||||
avatarUrl?.replace(/-\d+x\d+/, ''),
|
||||
avatarUrl,
|
||||
],
|
||||
gender: genderFromUrl(actorUrl),
|
||||
};
|
||||
});
|
||||
@@ -75,11 +83,8 @@ function scrapeScene({ query, html }, url, channel) {
|
||||
|| query.meta('property="og:image"')
|
||||
|| html.match(/poster: '(http.*\.jpg)'/)?.[1];
|
||||
|
||||
release.photos = query.imgs('.listPostSm a', 'href');
|
||||
release.trailer = query.video('source', 'src', { origin: channel.url });
|
||||
|
||||
release.likes = query.number('.vortex-p-like-counter');
|
||||
release.dislikes = query.number('.vortex-p-dislike-counter');
|
||||
release.photos = query.imgs('.gallery img');
|
||||
release.trailer = query.video('source', 'src', { origin: entity.url });
|
||||
|
||||
if (!release.date && release.poster) {
|
||||
const match = release.poster.match(/\/uploads\/(\d{4})\/(\d{2})/);
|
||||
@@ -93,38 +98,42 @@ function scrapeScene({ query, html }, url, channel) {
|
||||
return release;
|
||||
}
|
||||
|
||||
function scrapeProfile({ query, el }, entity, url) {
|
||||
function scrapeProfile({ query }, entity, url) {
|
||||
const profile = { url };
|
||||
const data = query.json('.yoast-schema-graph');
|
||||
|
||||
profile.gender = genderFromUrl(url);
|
||||
|
||||
profile.description = query.cnt('.about')?.replace(/sobre a atriz:/i, '').trim();
|
||||
profile.avatar = query.img('.left .thumb img');
|
||||
|
||||
profile.scenes = scrapeAll(qu.initAll(el, '.listPostLg .post'));
|
||||
if (data) {
|
||||
profile.avatar = data['@graph']?.find((item) => item['@type'] === 'ImageObject')?.url;
|
||||
}
|
||||
|
||||
return profile;
|
||||
}
|
||||
|
||||
async function fetchLatest(channel, page = 1) {
|
||||
const url = `${channel.url}/videos/page/${page}`;
|
||||
const res = await qu.getAll(url, '.listPostLg .post');
|
||||
const res = await unprint.get(url, { selectAll: '.grid > a[href*="/videos"]' });
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeAll(res.items, channel);
|
||||
return scrapeAll(res.context, channel);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function fetchProfilePage({ name, gender }, entity, secondAttempt) {
|
||||
const url = `${entity.url}/${gender === 'male' || secondAttempt ? 'atores' : 'atrizes'}/${slugify(name, '-')}`;
|
||||
const res = await qu.get(url);
|
||||
async function fetchProfilePage({ name, gender, url: actorUrl }, entity, secondAttempt) {
|
||||
const url = actorUrl || `${entity.url}/${gender === 'male' || secondAttempt ? 'atores' : 'atrizes'}/${slugify(name, '-')}`;
|
||||
const res = await unprint.get(url);
|
||||
|
||||
if (res.ok) {
|
||||
return { res, url };
|
||||
}
|
||||
|
||||
if (actorUrl) {
|
||||
return fetchProfilePage({ name, gender }, entity, false); // don't count as second attempt, retry without actor URL
|
||||
}
|
||||
|
||||
if (secondAttempt) {
|
||||
return res.status;
|
||||
}
|
||||
@@ -136,7 +145,7 @@ async function fetchProfile(baseActor, entity, options) {
|
||||
const { res, url } = await fetchProfilePage(baseActor, entity, false);
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeProfile(res.item, entity, url, options);
|
||||
return scrapeProfile(res.context, entity, url, options);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
|
||||
@@ -75,5 +75,4 @@ async function fetchLatest(channel, page = 1) {
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
scrapeScene,
|
||||
useUnprint: true,
|
||||
};
|
||||
|
||||
@@ -981,4 +981,5 @@ module.exports = {
|
||||
scrapeAll,
|
||||
scrapeMovie,
|
||||
scrapeScene,
|
||||
deprecated: true,
|
||||
};
|
||||
|
||||
@@ -93,8 +93,5 @@ async function fetchProfile({ name: actorName }, entity, include) {
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
};
|
||||
|
||||
@@ -351,8 +351,5 @@ module.exports = {
|
||||
fetchMovie,
|
||||
fetchProfile,
|
||||
fetchUpcoming,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
};
|
||||
|
||||
@@ -95,4 +95,5 @@ module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
scrapeScene,
|
||||
deprecated: true,
|
||||
};
|
||||
|
||||
@@ -151,4 +151,5 @@ module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
scrapeScene,
|
||||
deprecated: true,
|
||||
};
|
||||
|
||||
@@ -88,9 +88,6 @@ async function fetchProfile(actor) {
|
||||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
fetchProfile,
|
||||
};
|
||||
|
||||
@@ -87,8 +87,5 @@ async function fetchProfile({ name }, entity) {
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
};
|
||||
|
||||
@@ -130,8 +130,5 @@ async function fetchProfile({ slug }, { channel }) {
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
};
|
||||
|
||||
@@ -158,10 +158,8 @@ async function fetchLatestBlock(site, page) {
|
||||
module.exports = {
|
||||
fetchLatest: fetchLatestClassic,
|
||||
scrapeScene: scrapeSceneClassic,
|
||||
useUnprint: true,
|
||||
block: {
|
||||
scrapeScene: scrapeSceneBlock,
|
||||
fetchLatest: fetchLatestBlock,
|
||||
useUnprint: true,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -160,4 +160,5 @@ module.exports = {
|
||||
fetchUpcoming,
|
||||
fetchProfile,
|
||||
scrapeScene,
|
||||
deprecated: true,
|
||||
};
|
||||
|
||||
@@ -150,4 +150,5 @@ module.exports = {
|
||||
fetchLatest,
|
||||
scrapeScene,
|
||||
fetchProfile,
|
||||
deprecated: true,
|
||||
};
|
||||
|
||||
@@ -169,4 +169,5 @@ module.exports = {
|
||||
fetchUpcoming,
|
||||
scrapeAll,
|
||||
scrapeScene,
|
||||
deprecated: true,
|
||||
};
|
||||
|
||||
@@ -168,4 +168,5 @@ module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
scrapeScene,
|
||||
deprecated: true,
|
||||
};
|
||||
|
||||
@@ -118,8 +118,5 @@ async function fetchProfile(actor, entity, include) {
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
};
|
||||
|
||||
@@ -104,8 +104,5 @@ async function fetchProfile(actor, { entity }) {
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
};
|
||||
|
||||
@@ -174,4 +174,5 @@ module.exports = {
|
||||
fetchLatest,
|
||||
scrapeScene,
|
||||
fetchProfile,
|
||||
deprecated: true,
|
||||
};
|
||||
|
||||
@@ -86,8 +86,5 @@ async function fetchLatest(channel, page = 1) {
|
||||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
scrapeScene,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user