Rescraping upcoming scenes. Fixed language and scene deep scraping for Dorcel scraper.

This commit is contained in:
DebaucheryLibrarian 2021-06-02 03:27:32 +02:00
parent 42791c528e
commit c979173422
15 changed files with 105 additions and 15 deletions

View File

@ -112,9 +112,9 @@
v-if="!me" v-if="!me"
class="item-container item-more" class="item-container item-more"
><router-link ><router-link
:to="{ name: 'signup', query: { ref: $route.path } }" :to="{ name: 'login', query: { ref: $route.path } }"
class="link" class="link"
>Sign up</router-link>&nbsp;for more photos, trailers and features!</div> >Log in</router-link>&nbsp;for more photos, trailers and features!</div>
</div> </div>
</div> </div>
</template> </template>

View File

@ -125,6 +125,10 @@ async function init() {
app.directive('tooltip', { app.directive('tooltip', {
beforeMount(el, binding) { beforeMount(el, binding) {
if (!binding.value) {
return;
}
// don't include HTML in native title attribute // don't include HTML in native title attribute
const textEl = document.createElement('div'); const textEl = document.createElement('div');
textEl.innerHTML = binding.value; textEl.innerHTML = binding.value;

View File

@ -183,7 +183,7 @@ function initReleasesActions(store, router) {
isS3 isS3
} }
} }
posters: moviesPosterByMovieId { poster: moviesPosterByMovieId {
media { media {
id id
path path

View File

@ -667,6 +667,11 @@ exports.up = knex => Promise.resolve()
.onDelete('cascade'); .onDelete('cascade');
table.datetime('created_at') table.datetime('created_at')
.notNullable()
.defaultTo(knex.fn.now());
table.datetime('updated_at')
.notNullable()
.defaultTo(knex.fn.now()); .defaultTo(knex.fn.now());
})) }))
.then(() => knex.schema.createTable('releases_actors', (table) => { .then(() => knex.schema.createTable('releases_actors', (table) => {

Binary file not shown.

After

Width:  |  Height:  |  Size: 72 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.8 KiB

View File

@ -50,10 +50,15 @@ function scrapeScene({ query }, url, channel) {
const fallbackPoster = query.img('.player img'); const fallbackPoster = query.img('.player img');
release.poster = query.sourceSet('.player img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster]; release.poster = query.sourceSet('.player img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster];
release.movie = { const movieUrl = query.url('.movie a', 'href', { origin: channel.url });
title: query.cnt('.movie a'),
url: query.url('.movie a', 'href', { origin: channel.url }), if (movieUrl) {
}; release.movie = {
entryId: new URL(movieUrl).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1],
title: query.cnt('.movie a'),
url: query.url('.movie a', 'href', { origin: channel.url }),
};
}
return release; return release;
} }
@ -92,8 +97,20 @@ function scrapeMovie({ query, el }, url, channel) {
avatar: query.sourceSet(actorEl, '.thumbnail img', 'data-srcset'), avatar: query.sourceSet(actorEl, '.thumbnail img', 'data-srcset'),
})); }));
release.poster = query.sourceSet('.banner', 'data-srcset'); release.poster = query.sourceSet('.banner', 'data-src')?.[0];
release.covers = [query.sourceSet('.cover', 'data-srcset')]; release.covers = [query.all(query.el('.cover').parentElement, 'source')
?.map(coverEl => query.sourceSet(coverEl, null, 'data-srcset'))
.flat()
.sort((coverA, coverB) => {
const resA = Number(coverA.match(/_(\d{3,})_/)?.[1]);
const resB = Number(coverB.match(/_(\d{3,})_/)?.[1]);
if (resA < resB) return 1;
if (resA > resB) return -1;
return 0;
})
.concat(query.sourceSet('.cover', 'data-src')?.[0])];
release.scenes = scrapeAll(qu.initAll(el, '.scene'), channel); release.scenes = scrapeAll(qu.initAll(el, '.scene'), channel);
@ -120,13 +137,25 @@ async function scrapeProfile({ query, el }, entity, avatar) {
return profile; return profile;
} }
async function fetchLatest(channel, page = 1) { async function beforeFetchLatest(channel) {
// scene page only seems to accept language preferences from session
const session = qu.session();
await qu.getAll(`${channel.url}/en/news-videos-x-marc-dorcel`, '.scene', {
'X-Requested-With': 'XMLHttpRequest',
'Accept-Language': 'en-US,en', // fetch English rather than French titles
}, { session });
return session;
}
async function fetchLatest(channel, page = 1, options, { beforeFetchLatest: session }) {
const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`; const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;
const res = await qu.getAll(url, '.scene', { const res = await qu.getAll(url, '.scene', {
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
'Accept-Language': 'en-US,en', // fetch English rather than French titles 'Accept-Language': 'en-US,en', // fetch English rather than French titles
}); }, { session });
if (res.ok) { if (res.ok) {
return scrapeAll(res.items, channel); return scrapeAll(res.items, channel);
@ -152,8 +181,9 @@ async function fetchMovies(channel, page = 1) {
} }
async function fetchScene(url, channel) { async function fetchScene(url, channel) {
const res = await qu.get(url, '.content', { const res = await qu.get(url, null, {
'Accept-Language': 'en-US,en', // fetch English rather than French titles 'Accept-Language': 'en-US,en', // fetch English rather than French titles
Referer: `${channel.url}/en/news-videos-x-marc-dorcel`,
}); });
if (res.ok) { if (res.ok) {
@ -166,6 +196,7 @@ async function fetchScene(url, channel) {
async function fetchMovie(url, channel) { async function fetchMovie(url, channel) {
const res = await qu.get(url, '.content', { const res = await qu.get(url, '.content', {
'Accept-Language': 'en-US,en', // fetch English rather than French titles 'Accept-Language': 'en-US,en', // fetch English rather than French titles
Referer: `${channel.url}/en/porn-movie`,
}); });
if (res.ok) { if (res.ok) {
@ -202,6 +233,7 @@ async function fetchProfile(baseActor, { entity }) {
} }
module.exports = { module.exports = {
beforeFetchLatest,
fetchLatest, fetchLatest,
fetchScene, fetchScene,
fetchMovie, fetchMovie,

View File

@ -315,11 +315,33 @@ async function storeScenes(releases) {
const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios); const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);
const curatedNewReleaseEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId))); const curatedNewReleaseEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId)));
const storedReleases = await bulkInsert('releases', curatedNewReleaseEntries); const storedReleases = await bulkInsert('releases', curatedNewReleaseEntries);
const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : []; const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];
const releasesWithId = attachReleaseIds([].concat(uniqueReleases, duplicateReleases), [].concat(storedReleaseEntries, duplicateReleaseEntries));
const uniqueReleasesWithId = attachReleaseIds(uniqueReleases, storedReleaseEntries);
const duplicateReleasesWithId = attachReleaseIds(duplicateReleases, duplicateReleaseEntries);
const releasesWithId = uniqueReleasesWithId.concat(duplicateReleasesWithId);
try {
await knex.raw(`
UPDATE releases
SET url = COALESCE(new.url, releases.url),
date = COALESCE(new.date, releases.date),
title = COALESCE(new.title, releases.title),
description = COALESCE(new.description, releases.description),
duration = COALESCE(new.duration, releases.duration),
deep = new.url IS NOT NULL,
updated_at = NOW()
FROM json_to_recordset(:scenes)
AS new(id int, url text, date timestamptz, title text, description text, duration integer, deep boolean)
WHERE releases.id = new.id;
`, {
scenes: JSON.stringify(duplicateReleasesWithId),
});
} catch (error) {
console.log(error);
}
const [actors] = await Promise.all([ const [actors] = await Promise.all([
associateActors(releasesWithId, batchId), associateActors(releasesWithId, batchId),

View File

@ -44,7 +44,17 @@ async function filterUniqueReleases(releases) {
const duplicateReleaseEntries = await knex('releases') const duplicateReleaseEntries = await knex('releases')
.select(knex.raw('releases.*, row_to_json(entities) as entity')) .select(knex.raw('releases.*, row_to_json(entities) as entity'))
.leftJoin('entities', 'entities.id', 'releases.entity_id') .leftJoin('entities', 'entities.id', 'releases.entity_id')
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers); .whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
.where((builder) => {
// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
builder
.where('deep', true) // scene is already deep scraped
.orWhereNull('date')
.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
.orWhere(knex.raw('NOW() - date > INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updated expected
});
const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release)); const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {}); const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});

View File

@ -430,6 +430,10 @@ function init(context, selector, window) {
const element = selector ? context.querySelector(selector) : context; const element = selector ? context.querySelector(selector) : context;
if (!element) {
return null;
}
const legacyContextFuncs = Object.entries(legacyFuncs) // dynamically attach methods with context const legacyContextFuncs = Object.entries(legacyFuncs) // dynamically attach methods with context
.reduce((acc, [key, func]) => ({ .reduce((acc, [key, func]) => ({
...acc, ...acc,

13
src/utils/update.js Normal file
View File

@ -0,0 +1,13 @@
'use strict';

const knex = require('../knex');

// Scratch utility: verifies that Postgres json_to_recordset unpacks a JSON
// array of objects into rows, and that properties not declared in the
// AS clause (e.g. `foo`) are silently dropped — the same mechanism used by
// the bulk scene-update query in the store module.
async function init() {
  // :ids is bound as a JSON string; only `id int` is declared in the record
  // set definition, so extra keys on the objects do not affect the result.
  const result = await knex.raw('SELECT * FROM json_to_recordset(:ids) AS x(id int)', {
    ids: JSON.stringify([{ id: 1, foo: 'bar' }, { id: 2 }, { id: 3 }]),
  });

  console.log(result);
}

// Don't leave the entry-point promise floating: surface query failures and
// signal them through the exit code instead of an unhandled rejection.
init().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});