Rescrape upcoming scenes. Fixed language handling and scene deep scraping for the Dorcel scraper.
parent 42791c528e
commit c979173422
@@ -112,9 +112,9 @@
       v-if="!me"
       class="item-container item-more"
     ><router-link
-      :to="{ name: 'signup', query: { ref: $route.path } }"
+      :to="{ name: 'login', query: { ref: $route.path } }"
       class="link"
-    >Sign up</router-link> for more photos, trailers and features!</div>
+    >Log in</router-link> for more photos, trailers and features!</div>
   </div>
 </div>
 </template>
@@ -125,6 +125,10 @@ async function init() {
   app.directive('tooltip', {
     beforeMount(el, binding) {
+      if (!binding.value) {
+        return;
+      }
+
       // don't include HTML in native title attribute
       const textEl = document.createElement('div');
       textEl.innerHTML = binding.value;
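The guard above makes the tooltip directive a no-op when nothing is bound. A minimal sketch of the complete directive under that assumption; only the guard and the text extraction appear in this diff, the title-setting line is assumed:

    app.directive('tooltip', {
      beforeMount(el, binding) {
        if (!binding.value) {
          return; // nothing bound, leave the element untouched
        }

        // don't include HTML in native title attribute
        const textEl = document.createElement('div');
        textEl.innerHTML = binding.value;

        el.title = textEl.textContent; // assumed; the real directive may do more
      },
    });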
@@ -183,7 +183,7 @@ function initReleasesActions(store, router) {
         isS3
       }
     }
-    posters: moviesPosterByMovieId {
+    poster: moviesPosterByMovieId {
       media {
         id
         path
@@ -667,6 +667,11 @@ exports.up = knex => Promise.resolve()
         .onDelete('cascade');
+
+      table.datetime('created_at')
+        .notNullable()
+        .defaultTo(knex.fn.now());
+
       table.datetime('updated_at')
         .notNullable()
         .defaultTo(knex.fn.now());
     }))
     .then(() => knex.schema.createTable('releases_actors', (table) => {
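For reference, knex also ships a timestamps() helper that can generate a similar created_at/updated_at pair; whether it marks the columns NOT NULL varies between knex versions, so the explicit form above is unambiguous:

    // Possible shorthand (behaviour depends on knex version; verify NOT NULL):
    table.timestamps(false, true); // datetime columns defaulting to now()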
Binary files not shown (six images added: 72 KiB, 16 KiB, 6.7 KiB, 8.4 KiB, 5.3 KiB, 8.8 KiB).
@@ -50,10 +50,15 @@ function scrapeScene({ query }, url, channel) {
   const fallbackPoster = query.img('.player img');
   release.poster = query.sourceSet('.player img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster];

+  const movieUrl = query.url('.movie a', 'href', { origin: channel.url });
+
+  if (movieUrl) {
     release.movie = {
+      entryId: new URL(movieUrl).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1],
       title: query.cnt('.movie a'),
       url: query.url('.movie a', 'href', { origin: channel.url }),
     };
+  }

   return release;
 }
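The entryId is taken from the movie URL's path. A quick illustration with a fabricated URL; the /porn-movie/<slug> path shape is the only assumption:

    const movieUrl = 'https://www.example.com/en/porn-movie/some-movie-title';
    const entryId = new URL(movieUrl).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1];
    // entryId === 'some-movie-title'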
@@ -92,8 +97,20 @@ function scrapeMovie({ query, el }, url, channel) {
     avatar: query.sourceSet(actorEl, '.thumbnail img', 'data-srcset'),
   }));

-  release.poster = query.sourceSet('.banner', 'data-srcset');
-  release.covers = [query.sourceSet('.cover', 'data-srcset')];
+  release.poster = query.sourceSet('.banner', 'data-src')?.[0];
+  release.covers = [query.all(query.el('.cover').parentElement, 'source')
+    ?.map(coverEl => query.sourceSet(coverEl, null, 'data-srcset'))
+    .flat()
+    .sort((coverA, coverB) => {
+      const resA = Number(coverA.match(/_(\d{3,})_/)?.[1]);
+      const resB = Number(coverB.match(/_(\d{3,})_/)?.[1]);
+
+      if (resA < resB) return 1;
+      if (resA > resB) return -1;
+
+      return 0;
+    })
+    .concat(query.sourceSet('.cover', 'data-src')?.[0])];

   release.scenes = scrapeAll(qu.initAll(el, '.scene'), channel);
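The comparator above orders cover candidates by the resolution embedded in the filename, largest first. A standalone illustration with fabricated filenames; resB - resA is equivalent when both resolutions match:

    const covers = [
      'https://cdn.example.com/covers/title_480_front.jpg',
      'https://cdn.example.com/covers/title_1920_front.jpg',
      'https://cdn.example.com/covers/title_1080_front.jpg',
    ];

    covers.sort((coverA, coverB) => {
      const resA = Number(coverA.match(/_(\d{3,})_/)?.[1]);
      const resB = Number(coverB.match(/_(\d{3,})_/)?.[1]);

      return resB - resA; // descending by resolution
    });
    // -> [..._1920_..., ..._1080_..., ..._480_...]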
@@ -120,13 +137,25 @@ async function scrapeProfile({ query, el }, entity, avatar) {
   return profile;
 }

-async function fetchLatest(channel, page = 1) {
+async function beforeFetchLatest(channel) {
+  // scene page only seems to accept language preferences from session
+  const session = qu.session();
+
+  await qu.getAll(`${channel.url}/en/news-videos-x-marc-dorcel`, '.scene', {
+    'X-Requested-With': 'XMLHttpRequest',
+    'Accept-Language': 'en-US,en', // fetch English rather than French titles
+  }, { session });
+
+  return session;
+}
+
+async function fetchLatest(channel, page = 1, options, { beforeFetchLatest: session }) {
   const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;

   const res = await qu.getAll(url, '.scene', {
     'X-Requested-With': 'XMLHttpRequest',
     'Accept-Language': 'en-US,en', // fetch English rather than French titles
-  });
+  }, { session });

   if (res.ok) {
     return scrapeAll(res.items, channel);
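The new fetchLatest signature expects the scraper runner to pass beforeFetchLatest's return value under the hook's own name. A rough sketch of that assumed contract; this is not the actual runner code in the repo:

    async function runLatest(scraper, channel, page = 1) {
      // run the pre-fetch hook once, e.g. to establish an English-language session
      const preData = scraper.beforeFetchLatest
        ? await scraper.beforeFetchLatest(channel)
        : null;

      // expose the result under the hook name, matching
      // fetchLatest(channel, page, options, { beforeFetchLatest: session })
      return scraper.fetchLatest(channel, page, {}, { beforeFetchLatest: preData });
    }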
@@ -152,8 +181,9 @@ async function fetchMovies(channel, page = 1) {
 }

 async function fetchScene(url, channel) {
-  const res = await qu.get(url, '.content', {
+  const res = await qu.get(url, null, {
     'Accept-Language': 'en-US,en', // fetch English rather than French titles
+    Referer: `${channel.url}/en/news-videos-x-marc-dorcel`,
   });

   if (res.ok) {
@@ -166,6 +196,7 @@ async function fetchScene(url, channel) {
 async function fetchMovie(url, channel) {
   const res = await qu.get(url, '.content', {
     'Accept-Language': 'en-US,en', // fetch English rather than French titles
+    Referer: `${channel.url}/en/porn-movie`,
   });

   if (res.ok) {
@@ -202,6 +233,7 @@ async function fetchProfile(baseActor, { entity }) {
 }

 module.exports = {
+  beforeFetchLatest,
   fetchLatest,
   fetchScene,
   fetchMovie,
@@ -315,11 +315,33 @@ async function storeScenes(releases) {
   const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);

   const curatedNewReleaseEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId)));

   const storedReleases = await bulkInsert('releases', curatedNewReleaseEntries);

   const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];
-  const releasesWithId = attachReleaseIds([].concat(uniqueReleases, duplicateReleases), [].concat(storedReleaseEntries, duplicateReleaseEntries));
+
+  const uniqueReleasesWithId = attachReleaseIds(uniqueReleases, storedReleaseEntries);
+  const duplicateReleasesWithId = attachReleaseIds(duplicateReleases, duplicateReleaseEntries);
+  const releasesWithId = uniqueReleasesWithId.concat(duplicateReleasesWithId);
+
+  try {
+    await knex.raw(`
+      UPDATE releases
+        SET url = COALESCE(new.url, releases.url),
+            date = COALESCE(new.date, releases.date),
+            title = COALESCE(new.title, releases.title),
+            description = COALESCE(new.description, releases.description),
+            duration = COALESCE(new.duration, releases.duration),
+            deep = new.url IS NOT NULL,
+            updated_at = NOW()
+      FROM json_to_recordset(:scenes)
+        AS new(id int, url text, date timestamptz, title text, description text, duration integer, deep boolean)
+      WHERE releases.id = new.id;
+    `, {
+      scenes: JSON.stringify(duplicateReleasesWithId),
+    });
+  } catch (error) {
+    console.log(error);
+  }

   const [actors] = await Promise.all([
     associateActors(releasesWithId, batchId),
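json_to_recordset only reads the columns declared in the AS clause: extra keys on each release object are ignored, and missing keys come through as NULL, which the COALESCE calls then resolve to the stored column values. A fabricated example of the :scenes payload:

    const duplicateReleasesWithId = [
      // url present: stored url/title are overwritten and deep becomes true
      { id: 1201, url: 'https://example.com/scene/1201', title: 'New Title' },
      // url/date absent: they arrive as NULL, COALESCE keeps the stored values,
      // and deep is set back to false (new.url IS NULL)
      { id: 1202, title: 'Only The Title Changes' },
    ];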
@@ -44,7 +44,17 @@ async function filterUniqueReleases(releases) {
   const duplicateReleaseEntries = await knex('releases')
     .select(knex.raw('releases.*, row_to_json(entities) as entity'))
     .leftJoin('entities', 'entities.id', 'releases.entity_id')
-    .whereIn(['entity_id', 'entry_id'], releaseIdentifiers);
+    .whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
+    .where((builder) => {
+      // check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
+      builder
+        .where('deep', true) // scene is already deep scraped
+        .orWhereNull('date')
+        .orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
+        .orWhere(knex.raw('NOW() - date > INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
+        .orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no update expected
+    });

   const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
   const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});
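For readability, the exclusion rule above expressed as a plain predicate: a stored scene stays a regular duplicate (and is not rescraped) when any branch holds, otherwise it is rescraped for release-day updates. Illustrative only; the real filtering happens in SQL via the builder:

    function isSettledDuplicate(scene, now = new Date()) {
      const HOUR = 60 * 60 * 1000;

      return scene.deep // already deep scraped
        || !scene.date // undated
        || !['day', 'minute'].includes(scene.datePrecision) // date not precise enough
        || now - scene.date > 12 * HOUR // mirrors NOW() - date > INTERVAL '12 hours'
        || scene.updatedAt - scene.date > 24 * HOUR; // mirrors updated_at - date > INTERVAL '1 day'
    }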
@@ -430,6 +430,10 @@ function init(context, selector, window) {

   const element = selector ? context.querySelector(selector) : context;

+  if (!element) {
+    return null;
+  }
+
   const legacyContextFuncs = Object.entries(legacyFuncs) // dynamically attach methods with context
     .reduce((acc, [key, func]) => ({
       ...acc,
@@ -0,0 +1,13 @@
+'use strict';
+
+const knex = require('../knex');
+
+async function init() {
+  const result = await knex.raw('SELECT * FROM json_to_recordset(:ids) AS x(id int)', {
+    ids: JSON.stringify([{ id: 1, foo: 'bar' }, { id: 2 }, { id: 3 }]),
+  });
+
+  console.log(result);
+}
+
+init();
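This reads like a scratch script verifying the json_to_recordset behaviour relied on in storeScenes above: on PostgreSQL, knex.raw resolves to the driver's result object, and result.rows should come back as [{ id: 1 }, { id: 2 }, { id: 3 }], the extra foo key being ignored because only id is declared in the AS clause.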