Added series as channels with logos and photo album scraping to Little Caprice. Added various tag photos.

This commit is contained in:
DebaucheryLibrarian
2020-11-24 04:29:44 +01:00
parent 711a9441a6
commit 980efbc93d
66 changed files with 172 additions and 19 deletions

View File

@@ -57,7 +57,7 @@ function toBaseReleases(baseReleasesOrUrls, entity = null) {
// base release with URL
return {
...baseReleaseOrUrl,
entity,
entity: baseReleaseOrUrl.entity || entity,
deep: false,
};
}
@@ -75,7 +75,7 @@ function toBaseReleases(baseReleasesOrUrls, entity = null) {
// base release without URL, prepare for passthrough
return {
...baseReleaseOrUrl,
entity,
entity: baseReleaseOrUrl.entity || entity,
deep: false,
};
}

View File

@@ -39,6 +39,16 @@ function curateEntity(entity, includeParameters = false) {
}, includeParameters));
}
if (entity.siblings) {
curatedEntity.parent = {
...curatedEntity.parent,
children: entity.siblings.map(sibling => curateEntity({
...sibling,
parent: curatedEntity.parent,
}, includeParameters)),
};
}
if (entity.tags) {
curatedEntity.tags = entity.tags.map(tag => ({
id: tag.id,
@@ -72,9 +82,10 @@ async function fetchIncludedEntities() {
WITH RECURSIVE channels AS (
/* select configured channels and networks */
SELECT
entities.*
entities.*, json_agg(siblings) as siblings
FROM
entities
LEFT JOIN entities AS siblings ON siblings.parent_id = entities.parent_id
WHERE
CASE WHEN :includeAll
THEN
@@ -91,12 +102,13 @@ async function fetchIncludedEntities() {
AND entities.type = 'network')
OR (entities.slug = ANY(:excludedChannels)
AND entities.type = 'channel'))
GROUP BY entities.id
UNION ALL
/* select recursive children of configured networks */
SELECT
entities.*
entities.*, null as siblings
FROM
entities
INNER JOIN
@@ -117,7 +129,7 @@ async function fetchIncludedEntities() {
WHERE
channels.type = 'channel'
GROUP BY
entities.id
entities.id;
`, include);
const curatedNetworks = rawNetworks.rows.map(entity => curateEntity(entity, true));

View File

@@ -1,8 +1,33 @@
'use strict';
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
function scrapeAll(scenes) {
/**
 * Match a release to one of the series (sibling/child channels) by looking for
 * a series name or slug inside the release title.
 *
 * @param {Object} release - Scraped release; only `release.title` is read.
 * @param {Object} channel - Channel entity; series are taken from
 *   `channel.children`, falling back to `channel.parent.children`.
 * @returns {{ channel: string, title: string } | null} The matched series slug
 *   and the title with the series prefix (plus trailing whitespace, colons and
 *   hyphens) removed, or null when no series can be matched.
 */
function matchChannel(release, channel) {
  const series = channel?.children || channel?.parent?.children;

  // Without a series list or a title there is nothing to match against.
  if (!Array.isArray(series) || typeof release?.title !== 'string') {
    return null;
  }

  // Names and slugs are data, not patterns: escape regex metacharacters.
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Index every series by both its name and its slug; later entries win.
  const seriesByKey = new Map();

  for (const serie of series) {
    for (const key of [serie?.name, serie?.slug]) {
      // Skip missing keys so the literal text "undefined" never enters the pattern.
      if (key) {
        seriesByKey.set(String(key), serie);
      }
    }
  }

  if (seriesByKey.size === 0) {
    return null;
  }

  // Longest alternatives first, so a series whose name starts with another
  // series' name is not shadowed by the shorter one.
  const pattern = [...seriesByKey.keys()]
    .sort((keyA, keyB) => keyB.length - keyA.length)
    .map(escapeRegExp)
    .join('|');

  const serieName = release.title.match(new RegExp(pattern, 'i'))?.[0];

  // The matched text is normalised through slugify before the lookup.
  // NOTE(review): this presumes slugify(text, '') produces the same form as
  // the stored series slug — confirm against utils/slugify.
  const serie = serieName && seriesByKey.get(slugify(serieName, ''));

  if (!serie) {
    return null;
  }

  return {
    channel: serie.slug,
    title: release.title.replace(new RegExp(`${escapeRegExp(serieName)}[\\s:-]*`), ''),
  };
}
function scrapeAll(scenes, channel) {
return scenes.map(({ query, el }) => {
const release = {};
@@ -14,11 +39,29 @@ function scrapeAll(scenes) {
release.poster = query.img('img');
return release;
return {
...release,
...matchChannel(release, channel),
};
});
}
function scrapeScene({ query }) {
/**
 * Fetch the photo album behind a URL and turn its links into photo sources.
 *
 * @param {string} [url] - Album page URL; when falsy no request is made.
 * @returns {Promise<Array<{ src: string, referer: string }> | null>} One entry
 *   per URL found in `a` elements under `.et_post_gallery`, each carrying the
 *   album URL as referer; null when no URL was given or the request was not ok.
 */
async function fetchPhotos(url) {
  if (!url) {
    return null;
  }

  const res = await qu.get(url, '.et_post_gallery');

  if (!res.ok) {
    return null;
  }

  const photoUrls = res.item.query.urls('a');

  return photoUrls.map(photoUrl => ({
    src: photoUrl,
    referer: url,
  }));
}
async function scrapeScene({ query }, url, channel, include) {
const release = {};
const script = query.cnt('script.yoast-schema-graph');
@@ -41,20 +84,31 @@ function scrapeScene({ query }) {
const posterData = data['@graph']?.find(item => item['@type'] === 'ImageObject');
release.poster = posterData?.url
const poster = posterData?.url
|| query.q('meta[property="og:image"]', 'content')
|| query.q('meta[name="twitter:image"]', 'content');
release.poster = {
src: poster,
referer: url,
};
release.stars = Math.min(Number(query.q('.post-ratings-image', 'title')?.match(/average:\s*(\d\.\d+)/)?.[1]), 5) || null; // rating out of 5, yet sometimes 5.07?
// TODO: photo gallery, find if any video has a trailer
console.log(release);
return release;
if (include.photos) {
release.photos = await fetchPhotos(query.url('.vid_buttons a[href*="project/"]'));
}
return {
...release,
...matchChannel(release, channel),
};
}
async function fetchLatest(channel) {
// no apparent pagination, all updates on one page
const res = await qu.getAll(`${channel.url}/videos/`, '.project');
// using channels in part because main overview contains indistinguishable photo albums
const res = await qu.getAll(channel.url, '.project');
if (res.ok) {
return scrapeAll(res.items, channel);
@@ -63,11 +117,11 @@ async function fetchLatest(channel) {
return res.status;
}
async function fetchScene(url, channel) {
async function fetchScene(url, channel, baseRelease, include) {
const res = await qu.get(url);
if (res.ok) {
return scrapeScene(res.item, channel);
return scrapeScene(res.item, url, channel, include);
}
return res.status;

View File

@@ -7,7 +7,7 @@ const logger = require('../logger')(__filename);
function getVirtualConsole(filepath) {
const virtualConsole = new VirtualConsole();
const context = path.basename(filepath);
const context = path.basename(filepath).replace(path.extname(filepath), '');
virtualConsole.on('error', message => logger.warn(`Error from JSDOM in ${context}: ${message}`));
virtualConsole.on('jsdomError', message => logger.warn(`Error from JSDOM in ${context}: ${message}`));