Rescraping upcoming scenes. Fixed language and scene deep scraping for Dorcel scraper.
This commit is contained in:
parent
42791c528e
commit
c979173422
|
@ -112,9 +112,9 @@
|
||||||
v-if="!me"
|
v-if="!me"
|
||||||
class="item-container item-more"
|
class="item-container item-more"
|
||||||
><router-link
|
><router-link
|
||||||
:to="{ name: 'signup', query: { ref: $route.path } }"
|
:to="{ name: 'login', query: { ref: $route.path } }"
|
||||||
class="link"
|
class="link"
|
||||||
>Sign up</router-link> for more photos, trailers and features!</div>
|
>Log in</router-link> for more photos, trailers and features!</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</template>
|
</template>
|
||||||
|
|
|
@ -125,6 +125,10 @@ async function init() {
|
||||||
|
|
||||||
app.directive('tooltip', {
|
app.directive('tooltip', {
|
||||||
beforeMount(el, binding) {
|
beforeMount(el, binding) {
|
||||||
|
if (!binding.value) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// don't include HTML in native title attribute
|
// don't include HTML in native title attribute
|
||||||
const textEl = document.createElement('div');
|
const textEl = document.createElement('div');
|
||||||
textEl.innerHTML = binding.value;
|
textEl.innerHTML = binding.value;
|
||||||
|
|
|
@ -183,7 +183,7 @@ function initReleasesActions(store, router) {
|
||||||
isS3
|
isS3
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
posters: moviesPosterByMovieId {
|
poster: moviesPosterByMovieId {
|
||||||
media {
|
media {
|
||||||
id
|
id
|
||||||
path
|
path
|
||||||
|
|
|
@ -667,6 +667,11 @@ exports.up = knex => Promise.resolve()
|
||||||
.onDelete('cascade');
|
.onDelete('cascade');
|
||||||
|
|
||||||
table.datetime('created_at')
|
table.datetime('created_at')
|
||||||
|
.notNullable()
|
||||||
|
.defaultTo(knex.fn.now());
|
||||||
|
|
||||||
|
table.datetime('updated_at')
|
||||||
|
.notNullable()
|
||||||
.defaultTo(knex.fn.now());
|
.defaultTo(knex.fn.now());
|
||||||
}))
|
}))
|
||||||
.then(() => knex.schema.createTable('releases_actors', (table) => {
|
.then(() => knex.schema.createTable('releases_actors', (table) => {
|
||||||
|
|
Binary file not shown.
After Width: | Height: | Size: 72 KiB |
Binary file not shown.
After Width: | Height: | Size: 16 KiB |
Binary file not shown.
After Width: | Height: | Size: 6.7 KiB |
Binary file not shown.
After Width: | Height: | Size: 8.4 KiB |
Binary file not shown.
After Width: | Height: | Size: 5.3 KiB |
Binary file not shown.
After Width: | Height: | Size: 8.8 KiB |
|
@ -50,10 +50,15 @@ function scrapeScene({ query }, url, channel) {
|
||||||
const fallbackPoster = query.img('.player img');
|
const fallbackPoster = query.img('.player img');
|
||||||
release.poster = query.sourceSet('.player img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster];
|
release.poster = query.sourceSet('.player img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster];
|
||||||
|
|
||||||
|
const movieUrl = query.url('.movie a', 'href', { origin: channel.url });
|
||||||
|
|
||||||
|
if (movieUrl) {
|
||||||
release.movie = {
|
release.movie = {
|
||||||
|
entryId: new URL(movieUrl).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1],
|
||||||
title: query.cnt('.movie a'),
|
title: query.cnt('.movie a'),
|
||||||
url: query.url('.movie a', 'href', { origin: channel.url }),
|
url: query.url('.movie a', 'href', { origin: channel.url }),
|
||||||
};
|
};
|
||||||
|
}
|
||||||
|
|
||||||
return release;
|
return release;
|
||||||
}
|
}
|
||||||
|
@ -92,8 +97,20 @@ function scrapeMovie({ query, el }, url, channel) {
|
||||||
avatar: query.sourceSet(actorEl, '.thumbnail img', 'data-srcset'),
|
avatar: query.sourceSet(actorEl, '.thumbnail img', 'data-srcset'),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
release.poster = query.sourceSet('.banner', 'data-srcset');
|
release.poster = query.sourceSet('.banner', 'data-src')?.[0];
|
||||||
release.covers = [query.sourceSet('.cover', 'data-srcset')];
|
release.covers = [query.all(query.el('.cover').parentElement, 'source')
|
||||||
|
?.map(coverEl => query.sourceSet(coverEl, null, 'data-srcset'))
|
||||||
|
.flat()
|
||||||
|
.sort((coverA, coverB) => {
|
||||||
|
const resA = Number(coverA.match(/_(\d{3,})_/)?.[1]);
|
||||||
|
const resB = Number(coverB.match(/_(\d{3,})_/)?.[1]);
|
||||||
|
|
||||||
|
if (resA < resB) return 1;
|
||||||
|
if (resA > resB) return -1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
})
|
||||||
|
.concat(query.sourceSet('.cover', 'data-src')?.[0])];
|
||||||
|
|
||||||
release.scenes = scrapeAll(qu.initAll(el, '.scene'), channel);
|
release.scenes = scrapeAll(qu.initAll(el, '.scene'), channel);
|
||||||
|
|
||||||
|
@ -120,13 +137,25 @@ async function scrapeProfile({ query, el }, entity, avatar) {
|
||||||
return profile;
|
return profile;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchLatest(channel, page = 1) {
|
async function beforeFetchLatest(channel) {
|
||||||
|
// scene page only seems to accept language preferences from session
|
||||||
|
const session = qu.session();
|
||||||
|
|
||||||
|
await qu.getAll(`${channel.url}/en/news-videos-x-marc-dorcel`, '.scene', {
|
||||||
|
'X-Requested-With': 'XMLHttpRequest',
|
||||||
|
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
||||||
|
}, { session });
|
||||||
|
|
||||||
|
return session;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function fetchLatest(channel, page = 1, options, { beforeFetchLatest: session }) {
|
||||||
const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;
|
const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;
|
||||||
|
|
||||||
const res = await qu.getAll(url, '.scene', {
|
const res = await qu.getAll(url, '.scene', {
|
||||||
'X-Requested-With': 'XMLHttpRequest',
|
'X-Requested-With': 'XMLHttpRequest',
|
||||||
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
||||||
});
|
}, { session });
|
||||||
|
|
||||||
if (res.ok) {
|
if (res.ok) {
|
||||||
return scrapeAll(res.items, channel);
|
return scrapeAll(res.items, channel);
|
||||||
|
@ -152,8 +181,9 @@ async function fetchMovies(channel, page = 1) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchScene(url, channel) {
|
async function fetchScene(url, channel) {
|
||||||
const res = await qu.get(url, '.content', {
|
const res = await qu.get(url, null, {
|
||||||
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
||||||
|
Referer: `${channel.url}/en/news-videos-x-marc-dorcel`,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (res.ok) {
|
if (res.ok) {
|
||||||
|
@ -166,6 +196,7 @@ async function fetchScene(url, channel) {
|
||||||
async function fetchMovie(url, channel) {
|
async function fetchMovie(url, channel) {
|
||||||
const res = await qu.get(url, '.content', {
|
const res = await qu.get(url, '.content', {
|
||||||
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
||||||
|
Referer: `${channel.url}/en/porn-movie`,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (res.ok) {
|
if (res.ok) {
|
||||||
|
@ -202,6 +233,7 @@ async function fetchProfile(baseActor, { entity }) {
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
beforeFetchLatest,
|
||||||
fetchLatest,
|
fetchLatest,
|
||||||
fetchScene,
|
fetchScene,
|
||||||
fetchMovie,
|
fetchMovie,
|
||||||
|
|
|
@ -315,11 +315,33 @@ async function storeScenes(releases) {
|
||||||
const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);
|
const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);
|
||||||
|
|
||||||
const curatedNewReleaseEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId)));
|
const curatedNewReleaseEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId)));
|
||||||
|
|
||||||
const storedReleases = await bulkInsert('releases', curatedNewReleaseEntries);
|
const storedReleases = await bulkInsert('releases', curatedNewReleaseEntries);
|
||||||
|
|
||||||
const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];
|
const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];
|
||||||
const releasesWithId = attachReleaseIds([].concat(uniqueReleases, duplicateReleases), [].concat(storedReleaseEntries, duplicateReleaseEntries));
|
|
||||||
|
const uniqueReleasesWithId = attachReleaseIds(uniqueReleases, storedReleaseEntries);
|
||||||
|
const duplicateReleasesWithId = attachReleaseIds(duplicateReleases, duplicateReleaseEntries);
|
||||||
|
const releasesWithId = uniqueReleasesWithId.concat(duplicateReleasesWithId);
|
||||||
|
|
||||||
|
try {
|
||||||
|
await knex.raw(`
|
||||||
|
UPDATE releases
|
||||||
|
SET url = COALESCE(new.url, releases.url),
|
||||||
|
date = COALESCE(new.date, releases.date),
|
||||||
|
title = COALESCE(new.title, releases.title),
|
||||||
|
description = COALESCE(new.description, releases.description),
|
||||||
|
duration = COALESCE(new.duration, releases.duration),
|
||||||
|
deep = new.url IS NOT NULL,
|
||||||
|
updated_at = NOW()
|
||||||
|
FROM json_to_recordset(:scenes)
|
||||||
|
AS new(id int, url text, date timestamptz, title text, description text, duration integer, deep boolean)
|
||||||
|
WHERE releases.id = new.id;
|
||||||
|
`, {
|
||||||
|
scenes: JSON.stringify(duplicateReleasesWithId),
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
console.log(error);
|
||||||
|
}
|
||||||
|
|
||||||
const [actors] = await Promise.all([
|
const [actors] = await Promise.all([
|
||||||
associateActors(releasesWithId, batchId),
|
associateActors(releasesWithId, batchId),
|
||||||
|
|
|
@ -44,7 +44,17 @@ async function filterUniqueReleases(releases) {
|
||||||
const duplicateReleaseEntries = await knex('releases')
|
const duplicateReleaseEntries = await knex('releases')
|
||||||
.select(knex.raw('releases.*, row_to_json(entities) as entity'))
|
.select(knex.raw('releases.*, row_to_json(entities) as entity'))
|
||||||
.leftJoin('entities', 'entities.id', 'releases.entity_id')
|
.leftJoin('entities', 'entities.id', 'releases.entity_id')
|
||||||
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers);
|
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
|
||||||
|
.where((builder) => {
|
||||||
|
// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
|
||||||
|
builder
|
||||||
|
.where('deep', true) // scene is already deep scraped
|
||||||
|
.orWhereNull('date')
|
||||||
|
.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
|
||||||
|
.orWhere(knex.raw('NOW() - date > INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
|
||||||
|
.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no update expected
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
|
const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
|
||||||
const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});
|
const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});
|
||||||
|
|
|
@ -430,6 +430,10 @@ function init(context, selector, window) {
|
||||||
|
|
||||||
const element = selector ? context.querySelector(selector) : context;
|
const element = selector ? context.querySelector(selector) : context;
|
||||||
|
|
||||||
|
if (!element) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
const legacyContextFuncs = Object.entries(legacyFuncs) // dynamically attach methods with context
|
const legacyContextFuncs = Object.entries(legacyFuncs) // dynamically attach methods with context
|
||||||
.reduce((acc, [key, func]) => ({
|
.reduce((acc, [key, func]) => ({
|
||||||
...acc,
|
...acc,
|
||||||
|
|
|
@ -0,0 +1,13 @@
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
const knex = require('../knex');
|
||||||
|
|
||||||
|
async function init() {
|
||||||
|
const result = await knex.raw('SELECT * FROM json_to_recordset(:ids) AS x(id int)', {
|
||||||
|
ids: JSON.stringify([{ id: 1, foo: 'bar' }, { id: 2 }, { id: 3 }]),
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
init();
|
Loading…
Reference in New Issue