Added movie support to MindGeek scraper.

This commit is contained in:
DebaucheryLibrarian 2022-03-04 23:31:59 +01:00
parent 50b7f521b5
commit c6e977f842
11 changed files with 122 additions and 50 deletions

View File

@ -108,6 +108,7 @@
:fetch-releases="fetchEntity" :fetch-releases="fetchEntity"
:items-total="totalCount" :items-total="totalCount"
:items-per-page="limit" :items-per-page="limit"
:available-tags="entity.tags"
/> />
<div class="releases"> <div class="releases">

View File

@ -3,12 +3,6 @@
<div class="content-inner"> <div class="content-inner">
<SearchBar :placeholder="`Search ${totalCount} movies`" /> <SearchBar :placeholder="`Search ${totalCount} movies`" />
<TagFilter
class="filters-filter"
:filter="filter"
:available-tags="availableTags"
/>
<div <div
ref="tiles" ref="tiles"
class="tiles" class="tiles"
@ -36,7 +30,6 @@
import MovieTile from './movie-tile.vue'; import MovieTile from './movie-tile.vue';
import SearchBar from '../search/bar.vue'; import SearchBar from '../search/bar.vue';
import Pagination from '../pagination/pagination.vue'; import Pagination from '../pagination/pagination.vue';
import TagFilter from '../filters/tag-filter.vue';
async function fetchMovies() { async function fetchMovies() {
if (this.$route.query.query) { if (this.$route.query.query) {
@ -80,7 +73,6 @@ export default {
MovieTile, MovieTile,
SearchBar, SearchBar,
Pagination, Pagination,
TagFilter,
}, },
data() { data() {
return { return {

View File

@ -105,6 +105,7 @@ function curateEntity(entity, parent, releases) {
}; };
if (entity.tags) curatedEntity.tags = entity.tags.map(({ tag }) => tag); if (entity.tags) curatedEntity.tags = entity.tags.map(({ tag }) => tag);
if (entity.sceneTags) curatedEntity.sceneTags = entity.sceneTags;
if (entity.children) { if (entity.children) {
if (entity.children.nodes) { if (entity.children.nodes) {

View File

@ -41,6 +41,11 @@ function initEntitiesActions(store, router) {
slug slug
} }
} }
sceneTags {
id
name
slug
}
children: childEntitiesConnection( children: childEntitiesConnection(
orderBy: [PRIORITY_DESC, NAME_ASC], orderBy: [PRIORITY_DESC, NAME_ASC],
filter: { filter: {

View File

@ -1,8 +0,0 @@
/**
 * Migration step: creates the `movies_tagged` view as a plain
 * `SELECT *` over the `movies` table.
 */
async function up(knex) {
	return knex.raw(`
CREATE VIEW movies_tagged AS
SELECT * FROM movies;
`);
}

/**
 * Rollback step: removes the `movies_tagged` view if it exists.
 */
async function down(knex) {
	return knex.raw(`
DROP VIEW IF EXISTS movies_tagged;
`);
}

exports.up = up;
exports.down = down;

View File

@ -0,0 +1,23 @@
exports.up = async (knex) => knex.raw(`
CREATE FUNCTION entities_scene_tags(entity entities, selectable_tags text[]) RETURNS SETOF tags AS $$
SELECT tags.*
FROM releases
LEFT JOIN
releases_tags ON releases_tags.release_id = releases.id
LEFT JOIN
tags ON tags.id = releases_tags.tag_id
WHERE
releases.entity_id = entity.id
AND
CASE WHEN array_length(selectable_tags, 1) IS NOT NULL
THEN tags.slug = ANY(selectable_tags)
ELSE true
END
GROUP BY tags.id
ORDER BY tags.name;
$$ LANGUAGE SQL STABLE;
`);
exports.down = async (knex) => knex.raw(`
DROP FUNCTION IF EXISTS entities_tags;
`);

View File

@ -20,6 +20,7 @@ const scrapers = require('./scrapers/scrapers').actors;
const argv = require('./argv'); const argv = require('./argv');
const include = require('./utils/argv-include')(argv); const include = require('./utils/argv-include')(argv);
const bulkInsert = require('./utils/bulk-insert'); const bulkInsert = require('./utils/bulk-insert');
const chunk = require('./utils/chunk');
const logger = require('./logger')(__filename); const logger = require('./logger')(__filename);
const { toBaseReleases } = require('./deep'); const { toBaseReleases } = require('./deep');
@ -1048,33 +1049,42 @@ async function flushProfiles(actorIdsOrNames) {
logger.info(`Removed ${deleteCount} profiles`); logger.info(`Removed ${deleteCount} profiles`);
} }
async function deleteActors(actorIdsOrNames) { async function deleteActors(allActorIdsOrNames) {
const actors = await knex('actors') const deleteCounts = await Promise.map(chunk(allActorIdsOrNames), async (actorIdsOrNames) => {
.whereIn('id', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number')) const actors = await knex('actors')
.orWhere((builder) => { .whereIn('id', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number'))
builder .orWhere((builder) => {
.whereIn('name', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string')) builder
.whereNull('entity_id'); .whereIn('name', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string'))
}); .whereNull('entity_id');
});
const actorIds = actors.map((actor) => actor.id); const actorIds = actors.map((actor) => actor.id);
const sceneIds = await knex('releases_actors') const sceneIds = await knex('releases_actors')
.select('releases.id') .select('releases.id')
.whereIn('actor_id', actorIds) .whereIn('actor_id', actorIds)
.leftJoin('releases', 'releases.id', 'releases_actors.release_id') .leftJoin('releases', 'releases.id', 'releases_actors.release_id')
.pluck('id'); .pluck('id');
const [deletedScenesCount, deletedActorsCount] = await Promise.all([ const [deletedScenesCount, deletedActorsCount] = await Promise.all([
deleteScenes(sceneIds), deleteScenes(sceneIds),
knex('actors') knex('actors')
.whereIn('id', actorIds) .whereIn('id', actorIds)
.delete(), .delete(),
]); ]);
return { deletedScenesCount, deletedActorsCount };
}, { concurrency: 10 });
const deletedActorsCount = deleteCounts.reduce((acc, count) => acc + count.deletedActorsCount, 0);
const deletedScenesCount = deleteCounts.reduce((acc, count) => acc + count.deletedScenesCount, 0);
await flushOrphanedMedia(); await flushOrphanedMedia();
logger.info(`Removed ${deletedActorsCount} actors with ${deletedScenesCount} scenes`); logger.info(`Removed ${deletedActorsCount} actors with ${deletedScenesCount} scenes`);
return deletedActorsCount;
} }
async function flushActors() { async function flushActors() {

View File

@ -961,9 +961,12 @@ async function flushOrphanedMedia() {
await deleteS3Objects(orphanedMedia.filter((media) => media.is_s3)); await deleteS3Objects(orphanedMedia.filter((media) => media.is_s3));
} }
await fsPromises.rm(path.join(config.media.path, 'temp'), { recursive: true }); try {
await fsPromises.rm(path.join(config.media.path, 'temp'), { recursive: true });
logger.info('Cleared temporary media directory'); logger.info('Cleared temporary media directory');
} catch (error) {
logger.warn(`Failed to clear temporary media directory: ${error.message}`);
}
} }
module.exports = { module.exports = {

View File

@ -11,6 +11,12 @@ const slugify = require('../utils/slugify');
const http = require('../utils/http'); const http = require('../utils/http');
const { inchesToCm, lbsToKg } = require('../utils/convert'); const { inchesToCm, lbsToKg } = require('../utils/convert');
/**
 * Resolve the base URL under which a channel's releases live.
 *
 * Resolution order:
 *  1. `channel.parameters.scene`, returned as-is (the `path` argument is not appended);
 *  2. the channel's own `url` plus `path`, when `parameters.native` is truthy
 *     or the channel's `type` is 'network';
 *  3. the parent's `url` plus `path`.
 *
 * @param {object} channel - channel/network entity; `channel.parent` is read when neither of the first two cases applies
 * @param {string} [path='/scene'] - path segment appended to the chosen URL
 * @returns {string} base path for building release URLs
 */
function getBasePath(channel, path = '/scene') {
	const explicitBase = channel.parameters?.scene;

	if (explicitBase) {
		return explicitBase;
	}

	const usesOwnUrl = channel.parameters?.native || channel.type === 'network';
	const ownBase = usesOwnUrl && `${channel.url}${path}`;

	return ownBase || `${channel.parent.url}${path}`;
}
function getThumbs(scene) { function getThumbs(scene) {
if (scene.images.poster) { if (scene.images.poster) {
return Object.values(scene.images.poster) // can be { 0: {}, 1: {}, ... } instead of array return Object.values(scene.images.poster) // can be { 0: {}, 1: {}, ... } instead of array
@ -18,7 +24,7 @@ function getThumbs(scene) {
.map((image) => image.xl.url); .map((image) => image.xl.url);
} }
if (scene.images.card_main_rect) { if (Array.isArray(scene.images.card_main_rect)) {
return scene.images.card_main_rect return scene.images.card_main_rect
.concat(scene.images.card_secondary_rect || []) .concat(scene.images.card_secondary_rect || [])
.map((image) => image.xl.url.replace('.thumb', '')); .map((image) => image.xl.url.replace('.thumb', ''));
@ -27,6 +33,20 @@ function getThumbs(scene) {
return []; return [];
} }
/**
 * Collect cover image URLs from a release's `images` object.
 *
 * Only the first cover entry (`images.cover[0]`) is read. Its URLs are
 * returned as one inner array, listing the md, sm and xs sizes before the
 * xx, xl and lg sizes; sizes that are absent yield `undefined` entries.
 * Presumably the inner array is treated downstream as an ordered list of
 * alternatives for a single cover; confirm against the media handling code.
 *
 * @param {object} [images] - image data; may lack a `cover` entry
 * @returns {Array<Array<string|undefined>>} one group of candidate URLs, or
 *   an empty array when no cover entry is available
 */
function getCovers(images) {
	const cover = images?.cover?.[0];

	if (!cover) {
		// no cover entry: report no covers rather than throwing on cover[0]
		return [];
	}

	return [
		[
			cover.md?.url,
			cover.sm?.url,
			cover.xs?.url,
			// bigger but usually upscaled
			cover.xx?.url,
			cover.xl?.url,
			cover.lg?.url,
		],
	];
}
function getVideos(data) { function getVideos(data) {
const teaserSources = data.videos.mediabook?.files; const teaserSources = data.videos.mediabook?.files;
const trailerSources = data.children.find((child) => child.type === 'trailer')?.videos.full?.files; const trailerSources = data.children.find((child) => child.type === 'trailer')?.videos.full?.files;
@ -51,9 +71,7 @@ function scrapeLatestX(data, site, filterChannel) {
description: data.description, description: data.description,
}; };
const basepath = site.parameters?.scene const basepath = getBasePath(site);
|| (site.parameters?.native && `${site.url}/scene`)
|| `${site.parent.url}/scene`;
release.url = `${basepath}/${release.entryId}/${slugify(release.title)}`; release.url = `${basepath}/${release.entryId}/${slugify(release.title)}`;
release.date = new Date(data.dateReleased); release.date = new Date(data.dateReleased);
@ -96,7 +114,7 @@ async function scrapeLatest(items, site, filterChannel) {
}; };
} }
function scrapeScene(data, url, _site, networkName) { function scrapeRelease(data, url, channel, networkName) {
const release = {}; const release = {};
const { id: entryId, title, description } = data; const { id: entryId, title, description } = data;
@ -129,6 +147,29 @@ function scrapeScene(data, url, _site, networkName) {
release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`; release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`;
if (data.parent?.type === 'movie') {
release.movie = {
entryId: data.parent.id,
url: `${getBasePath(channel, '/movie')}/${data.parent.id}/${slugify(data.parent.title, '-', { removePunctuation: true })}`,
title: data.parent.title,
description: data.parent.description,
date: new Date(data.parent.dateReleased),
channel: slugify(data.parent.collections?.name || data.parent.brand),
covers: getCovers(data.parent.images),
shallow: true,
};
}
if (data.type === 'movie') {
release.covers = getCovers(data.images);
release.scenes = data.children?.map((scene) => ({
entryId: scene.id,
url: `${getBasePath(channel)}/${scene.id}/${slugify(scene.title)}`,
title: scene.title,
shallow: true,
}));
}
return release; return release;
} }
@ -230,7 +271,7 @@ function scrapeProfile(data, html, releases = [], networkName) {
profile.naturalBoobs = false; profile.naturalBoobs = false;
} }
profile.releases = releases.map((release) => scrapeScene(release, null, null, networkName)); profile.releases = releases.map((release) => scrapeRelease(release, null, null, networkName));
return profile; return profile;
} }
@ -292,8 +333,8 @@ async function fetchUpcoming(site, page, options) {
return res.statusCode; return res.statusCode;
} }
async function fetchScene(url, site, baseScene, options) { async function fetchRelease(url, site, baseScene, options) {
if (baseScene?.entryId) { if (baseScene?.entryId && !baseScene.shallow) {
// overview and deep data is the same, don't hit server unnecessarily // overview and deep data is the same, don't hit server unnecessarily
return baseScene; return baseScene;
} }
@ -312,7 +353,7 @@ async function fetchScene(url, site, baseScene, options) {
if (res.status === 200 && res.body.result) { if (res.status === 200 && res.body.result) {
return { return {
scene: scrapeScene(res.body.result, url, site), scene: scrapeRelease(res.body.result, url, site),
}; };
} }
@ -374,6 +415,7 @@ module.exports = {
scrapeLatestX, scrapeLatestX,
fetchLatest, fetchLatest,
fetchUpcoming, fetchUpcoming,
fetchScene, fetchScene: fetchRelease,
fetchMovie: fetchRelease,
fetchProfile, fetchProfile,
}; };

View File

@ -142,6 +142,7 @@ async function getTrailer(scene, channel, url) {
return null; return null;
} }
/*
async function getPhotosLegacy(url) { async function getPhotosLegacy(url) {
const htmlRes = await http.get(url, { const htmlRes = await http.get(url, {
extract: { extract: {
@ -169,6 +170,7 @@ async function getPhotosLegacy(url) {
return []; return [];
} }
} }
*/
async function getPhotos(url) { async function getPhotos(url) {
const htmlRes = await http.get(url, { const htmlRes = await http.get(url, {

View File

@ -392,7 +392,8 @@ async function associateMovieScenes(movies, movieScenes) {
return null; return null;
} }
const sceneMovie = moviesByEntityIdAndEntryId[scene.entity.id]?.[scene.movie.entryId]; const sceneMovie = moviesByEntityIdAndEntryId[scene.entity.id]?.[scene.movie.entryId]
|| moviesByEntityIdAndEntryId[scene.entity.parent?.id]?.[scene.movie.entryId];
if (sceneMovie?.id) { if (sceneMovie?.id) {
return { return {