Rescraping upcoming scenes. Fixed language and scene deep scraping for Dorcel scraper.

This commit is contained in:
DebaucheryLibrarian 2021-06-02 03:27:32 +02:00
parent 42791c528e
commit c979173422
15 changed files with 105 additions and 15 deletions

View File

@ -112,9 +112,9 @@
v-if="!me"
class="item-container item-more"
><router-link
:to="{ name: 'signup', query: { ref: $route.path } }"
:to="{ name: 'login', query: { ref: $route.path } }"
class="link"
>Sign up</router-link>&nbsp;for more photos, trailers and features!</div>
>Log in</router-link>&nbsp;for more photos, trailers and features!</div>
</div>
</div>
</template>

View File

@ -125,6 +125,10 @@ async function init() {
app.directive('tooltip', {
beforeMount(el, binding) {
if (!binding.value) {
return;
}
// don't include HTML in native title attribute
const textEl = document.createElement('div');
textEl.innerHTML = binding.value;

View File

@ -183,7 +183,7 @@ function initReleasesActions(store, router) {
isS3
}
}
posters: moviesPosterByMovieId {
poster: moviesPosterByMovieId {
media {
id
path

View File

@ -667,6 +667,11 @@ exports.up = knex => Promise.resolve()
.onDelete('cascade');
table.datetime('created_at')
.notNullable()
.defaultTo(knex.fn.now());
table.datetime('updated_at')
.notNullable()
.defaultTo(knex.fn.now());
}))
.then(() => knex.schema.createTable('releases_actors', (table) => {

Binary file not shown.

After

Width:  |  Height:  |  Size: 72 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.8 KiB

View File

@ -50,10 +50,15 @@ function scrapeScene({ query }, url, channel) {
const fallbackPoster = query.img('.player img');
release.poster = query.sourceSet('.player img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster];
release.movie = {
title: query.cnt('.movie a'),
url: query.url('.movie a', 'href', { origin: channel.url }),
};
const movieUrl = query.url('.movie a', 'href', { origin: channel.url });
if (movieUrl) {
release.movie = {
entryId: new URL(movieUrl).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1],
title: query.cnt('.movie a'),
url: query.url('.movie a', 'href', { origin: channel.url }),
};
}
return release;
}
@ -92,8 +97,20 @@ function scrapeMovie({ query, el }, url, channel) {
avatar: query.sourceSet(actorEl, '.thumbnail img', 'data-srcset'),
}));
release.poster = query.sourceSet('.banner', 'data-srcset');
release.covers = [query.sourceSet('.cover', 'data-srcset')];
release.poster = query.sourceSet('.banner', 'data-src')?.[0];
release.covers = [query.all(query.el('.cover').parentElement, 'source')
?.map(coverEl => query.sourceSet(coverEl, null, 'data-srcset'))
.flat()
.sort((coverA, coverB) => {
const resA = Number(coverA.match(/_(\d{3,})_/)?.[1]);
const resB = Number(coverB.match(/_(\d{3,})_/)?.[1]);
if (resA < resB) return 1;
if (resA > resB) return -1;
return 0;
})
.concat(query.sourceSet('.cover', 'data-src')?.[0])];
release.scenes = scrapeAll(qu.initAll(el, '.scene'), channel);
@ -120,13 +137,25 @@ async function scrapeProfile({ query, el }, entity, avatar) {
return profile;
}
async function fetchLatest(channel, page = 1) {
/**
 * Creates a session and issues one request to the channel's English news
 * listing with it, so that later requests sharing this session can reuse it.
 * The response of that request is discarded.
 *
 * @param {{ url: string }} channel - channel whose `url` is used as the base of the listing URL
 * @returns {Promise<object>} the session created by `qu.session()`
 */
async function beforeFetchLatest(channel) {
	const session = qu.session();

	const listingUrl = `${channel.url}/en/news-videos-x-marc-dorcel`;
	const headers = {
		'X-Requested-With': 'XMLHttpRequest',
		'Accept-Language': 'en-US,en', // ask for English rather than French titles
	};

	// the scene page only seems to take its language preference from the session,
	// so the English listing is requested once with this session before anything else
	await qu.getAll(listingUrl, '.scene', headers, { session });

	return session;
}
async function fetchLatest(channel, page = 1, options, { beforeFetchLatest: session }) {
const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;
const res = await qu.getAll(url, '.scene', {
'X-Requested-With': 'XMLHttpRequest',
'Accept-Language': 'en-US,en', // fetch English rather than French titles
});
}, { session });
if (res.ok) {
return scrapeAll(res.items, channel);
@ -152,8 +181,9 @@ async function fetchMovies(channel, page = 1) {
}
async function fetchScene(url, channel) {
const res = await qu.get(url, '.content', {
const res = await qu.get(url, null, {
'Accept-Language': 'en-US,en', // fetch English rather than French titles
Referer: `${channel.url}/en/news-videos-x-marc-dorcel`,
});
if (res.ok) {
@ -166,6 +196,7 @@ async function fetchScene(url, channel) {
async function fetchMovie(url, channel) {
const res = await qu.get(url, '.content', {
'Accept-Language': 'en-US,en', // fetch English rather than French titles
Referer: `${channel.url}/en/porn-movie`,
});
if (res.ok) {
@ -202,6 +233,7 @@ async function fetchProfile(baseActor, { entity }) {
}
module.exports = {
beforeFetchLatest,
fetchLatest,
fetchScene,
fetchMovie,

View File

@ -315,11 +315,33 @@ async function storeScenes(releases) {
const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);
const curatedNewReleaseEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId)));
const storedReleases = await bulkInsert('releases', curatedNewReleaseEntries);
const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];
const releasesWithId = attachReleaseIds([].concat(uniqueReleases, duplicateReleases), [].concat(storedReleaseEntries, duplicateReleaseEntries));
const uniqueReleasesWithId = attachReleaseIds(uniqueReleases, storedReleaseEntries);
const duplicateReleasesWithId = attachReleaseIds(duplicateReleases, duplicateReleaseEntries);
const releasesWithId = uniqueReleasesWithId.concat(duplicateReleasesWithId);
try {
await knex.raw(`
UPDATE releases
SET url = COALESCE(new.url, releases.url),
date = COALESCE(new.date, releases.date),
title = COALESCE(new.title, releases.title),
description = COALESCE(new.description, releases.description),
duration = COALESCE(new.duration, releases.duration),
deep = new.url IS NOT NULL,
updated_at = NOW()
FROM json_to_recordset(:scenes)
AS new(id int, url text, date timestamptz, title text, description text, duration integer, deep boolean)
WHERE releases.id = new.id;
`, {
scenes: JSON.stringify(duplicateReleasesWithId),
});
} catch (error) {
console.log(error);
}
const [actors] = await Promise.all([
associateActors(releasesWithId, batchId),

View File

@ -44,7 +44,17 @@ async function filterUniqueReleases(releases) {
const duplicateReleaseEntries = await knex('releases')
.select(knex.raw('releases.*, row_to_json(entities) as entity'))
.leftJoin('entities', 'entities.id', 'releases.entity_id')
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers);
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
.where((builder) => {
// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
builder
.where('deep', true) // scene is already deep scraped
.orWhereNull('date')
.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
.orWhere(knex.raw('NOW() - date > INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updated expected
});
const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});

View File

@ -430,6 +430,10 @@ function init(context, selector, window) {
const element = selector ? context.querySelector(selector) : context;
if (!element) {
return null;
}
const legacyContextFuncs = Object.entries(legacyFuncs) // dynamically attach methods with context
.reduce((acc, [key, func]) => ({
...acc,

13
src/utils/update.js Normal file
View File

@ -0,0 +1,13 @@
'use strict';
const knex = require('../knex');
/**
 * Scratch check for passing a JSON array to `json_to_recordset()` through a
 * named knex binding.
 *
 * Runs a single query that expands three sample objects into rows with one
 * `id` column, logs the raw query result, and then tears down the knex
 * connection pool so the one-off script does not keep idling on open
 * connections.
 *
 * @returns {Promise<void>} resolves once the result is logged and the pool is destroyed
 * @throws rethrows any error raised by the query; the pool is still destroyed first
 */
async function init() {
	try {
		const result = await knex.raw('SELECT * FROM json_to_recordset(:ids) AS x(id int)', {
			// the first entry carries a `foo` key that is not part of the x(id int) column list;
			// presumably included to confirm that extra keys are ignored
			ids: JSON.stringify([{ id: 1, foo: 'bar' }, { id: 2 }, { id: 3 }]),
		});

		console.log(result);
	} finally {
		// release the pool whether or not the query succeeded
		await knex.destroy();
	}
}
init();