Added upcoming, profile and detailed scene actor scraping to InTheCrack. Fixed clip upsert.

This commit is contained in:
DebaucheryLibrarian 2020-08-20 23:35:18 +02:00
parent 552e6da392
commit 4ec89e2cc8
16 changed files with 188 additions and 35 deletions

View File

@ -86,12 +86,14 @@ export default {
} }
.clip-duration { .clip-duration {
background: var(--darken);
color: var(--text-light); color: var(--text-light);
display: block; display: block;
position: absolute; position: absolute;
bottom: 0; top: 0;
left: 0; right: 0;
padding: .5rem .5rem .75rem 1rem; padding: .25rem .5rem;
font-size: .9rem;
font-weight: bold; font-weight: bold;
text-shadow: 0 0 2px var(--darken-strong); text-shadow: 0 0 2px var(--darken-strong);
} }

View File

@ -3,7 +3,6 @@
<div class="column"> <div class="column">
<div class="tidbits"> <div class="tidbits">
<a <a
v-if="release.date"
:title="release.url && `View scene on ${release.entity.name}`" :title="release.url && `View scene on ${release.entity.name}`"
:href="release.url" :href="release.url"
:class="{ link: release.url }" :class="{ link: release.url }"
@ -11,8 +10,8 @@
rel="noopener noreferrer" rel="noopener noreferrer"
class="tidbit date nolink" class="tidbit date nolink"
> >
<span class="date-compact">{{ formatDate(release.date, 'MMM D, YYYY', release.datePrecision) }}</span> <span class="date-compact">{{ release.date ? formatDate(release.date, 'MMM D, YYYY', release.datePrecision) : 'Date N/A' }}</span>
<span class="date-full">{{ formatDate(release.date, 'MMMM D, YYYY', release.datePrecision) }}</span> <span class="date-full">{{ release.date ? formatDate(release.date, 'MMMM D, YYYY', release.datePrecision) : 'Date unknown' }}</span>
<Icon <Icon
v-if="release.url" v-if="release.url"

View File

@ -103,6 +103,7 @@ async function mounted() {
'femdom', 'femdom',
], ],
toys: [ toys: [
'anal-toys',
'double-dildo', 'double-dildo',
'double-dildo-blowjob', 'double-dildo-blowjob',
], ],

Binary file not shown.

After

Width:  |  Height:  |  Size: 372 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 932 KiB

After

Width:  |  Height:  |  Size: 1019 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.1 KiB

After

Width:  |  Height:  |  Size: 6.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 34 KiB

After

Width:  |  Height:  |  Size: 26 KiB

View File

@ -2649,7 +2649,7 @@ const sites = [
{ {
slug: 'inthecrack', slug: 'inthecrack',
name: 'InTheCrack', name: 'InTheCrack',
url: 'https://inthecrack.com/', url: 'https://inthecrack.com',
}, },
// INTERRACIAL PASS // INTERRACIAL PASS
{ {

View File

@ -589,6 +589,7 @@ const tagPosters = [
['airtight', 6, 'Remy Lacroix in "Ass Worship 14" for Jules Jordan'], ['airtight', 6, 'Remy Lacroix in "Ass Worship 14" for Jules Jordan'],
['anal', 0, 'Adriana Chechik in "Manuel Creampies Their Asses 3" for Jules Jordan'], ['anal', 0, 'Adriana Chechik in "Manuel Creampies Their Asses 3" for Jules Jordan'],
['anal-creampie', 1, 'Aleska Diamond in "Aleska Wants More" for Asshole Fever'], ['anal-creampie', 1, 'Aleska Diamond in "Aleska Wants More" for Asshole Fever'],
['anal-toys', 0, 'Kira Noir in 1225 for InTheCrack'],
['ass-eating', 0, 'Angelica Heart and Leanna Sweet in "ATM Bitches" for Asshole Fever'], ['ass-eating', 0, 'Angelica Heart and Leanna Sweet in "ATM Bitches" for Asshole Fever'],
['asian', 0, 'Jade Kush for Erotica X'], ['asian', 0, 'Jade Kush for Erotica X'],
['atm', 2, 'Jureka Del Mar in "Stretched Out" for Her Limit'], ['atm', 2, 'Jureka Del Mar in "Stretched Out" for Her Limit'],

View File

@ -67,6 +67,7 @@ const { argv } = yargs
describe: 'Fetch all scenes for an actor', describe: 'Fetch all scenes for an actor',
type: 'boolean', type: 'boolean',
default: false, default: false,
alias: 'actor-scenes',
}) })
.option('actors-sources', { .option('actors-sources', {
describe: 'Use these scrapers for actor data', describe: 'Use these scrapers for actor data',

View File

@ -135,6 +135,10 @@ async function scrapeRelease(baseRelease, entities, type = 'scene') {
} catch (error) { } catch (error) {
logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`); logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);
if (argv.debug) {
console.error(error);
}
if (error.code === 'NO_ENTRY_ID') { if (error.code === 'NO_ENTRY_ID') {
return null; return null;
} }

View File

@ -4,17 +4,19 @@ const moment = require('moment');
const qu = require('../utils/q'); const qu = require('../utils/q');
const slugify = require('../utils/slugify'); const slugify = require('../utils/slugify');
const { feetInchesToCm, lbsToKg } = require('../utils/convert');
function scrapeAll(scenes, channel) { function scrapeAll(scenes, channel) {
return scenes.map(({ query }) => { return scenes.map(({ query }) => {
const release = {}; const release = {};
release.url = query.url('a', 'href', { origin: channel.url }); release.url = query.url('a', 'href', { origin: channel.url });
release.entryId = new URL(release.url).pathname.match(/\/Collection\/(\d+)/)[1]; // release.entryId = new URL(release.url).pathname.match(/\/Collection\/(\d+)/)[1]; can't be matched with upcoming scenes
release.shootId = query.cnt('a span:nth-of-type(1)').match(/^\d+/)?.[0]; release.shootId = query.cnt('a span:nth-of-type(1)').match(/^\d+/)?.[0];
release.date = query.date('a span:nth-of-type(2)', 'YYYY-MM-DD'); release.entryId = release.shootId;
release.date = query.date('a span:nth-of-type(2)', 'YYYY-MM-DD');
release.actors = (query.q('a img', 'alt') || query.cnt('a span:nth-of-type(1)'))?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g); release.actors = (query.q('a img', 'alt') || query.cnt('a span:nth-of-type(1)'))?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);
release.poster = release.shootId release.poster = release.shootId
@ -25,13 +27,145 @@ function scrapeAll(scenes, channel) {
}); });
} }
function scrapeScene({ query, html }, url, channel) { function scrapeUpcoming(scenes, channel) {
return scenes.map(({ query }) => {
const release = {}; const release = {};
release.entryId = new URL(url).pathname.match(/\/Collection\/(\d+)/)[1]; const title = query.cnt('span');
release.shootId = query.cnt('h2 span').match(/^\d+/)?.[0];
release.actors = query.cnt('h2 span')?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g); release.entryId = title.match(/^\d+/)[0];
release.actors = title.slice(0, title.indexOf('-')).match(/[a-zA-Z]+(\s[a-zA-Z]+)*/g);
const date = moment.utc(title.match(/\w+ \d+\w+$/)[0], 'MMM Do');
if (date.isBefore()) {
// date is next year
release.date = date.add(1, 'year').toDate();
} else {
release.date = date.toDate();
}
release.poster = [
`https://inthecrack.com/assets/images/posters/collections/${release.entryId}.jpg`,
query.img('img', 'src', { origin: channel.url }),
];
return release;
});
}
/**
 * Build release objects from the collection tiles listed for a model.
 *
 * Each tile's `figure p` lines are read as `Key: value` pairs; the keys are
 * slugified with `_` (e.g. `Release Date` becomes `release_date`).
 *
 * @param {Array<{query: Object}>} items - Query contexts, one per tile.
 * @param {string} actorName - Name stored as the only actor of every release.
 * @param {{url: string}} channel - Channel whose URL is the origin for relative links.
 * @returns {Array<Object>} Releases; placeholder tiles and tiles without a
 *   parsable collection number are left out.
 */
function scrapeProfileScenes(items, actorName, channel) {
  return items
    .map(({ query }) => {
      // placeholder tile shown instead of a scene list
      if (slugify(query.cnt()) === 'no-other-collections') {
        return null;
      }

      const details = Object.fromEntries(query.cnts('figure p').map((info) => {
        // split on the first colon only, so a value that contains a colon stays intact;
        // lines without a colon are kept as a key with an undefined value
        const separatorIndex = info.indexOf(':');
        const key = separatorIndex === -1 ? info : info.slice(0, separatorIndex);
        const value = separatorIndex === -1 ? undefined : info.slice(separatorIndex + 1).trim();

        return [slugify(key, '_'), value];
      }));

      const shootId = details.collection?.match(/\d+/)?.[0];

      if (!shootId) {
        // without a collection number there is no entry ID to store the release under;
        // skip this tile instead of failing the scrape for every other tile
        return null;
      }

      // no duration is set here: the 'NN min video' detail is ignored in favour of clip length
      return {
        url: query.url('a', 'href', { origin: channel.url }),
        shootId,
        entryId: shootId, // the shoot ID doubles as the entry ID
        date: qu.parseDate(details.release_date, 'YYYY-MM-DD'),
        actors: [actorName],
        productionLocation: details.shoot_location,
        poster: [
          // poster addressed by collection number first, the tile's own image as second source
          `https://inthecrack.com/assets/images/posters/collections/${shootId}.jpg`,
          query.img('img', 'src', { origin: channel.url }),
        ],
      };
    })
    .filter(Boolean);
}
/**
 * Build an actor profile from a biography element.
 *
 * @param {{query: Object}} context - Query context of the biography markup.
 * @param {?string} actorName - Known actor name; when falsy the `Name` entry
 *   of the bio list is used instead.
 * @param {?string} actorAvatar - URL of the avatar thumbnail; it is matched
 *   against release thumbnails by URL path only.
 * @param {{url: string}} channel - Channel handed on to the scene scraper.
 * @param {Object<string, Array<Object>>} [releasesFromScene] - Releases keyed
 *   by actor name; when given, the bio list is read from `ul li` rather than
 *   `div.modelInfo li`.
 * @returns {Object} Profile with name, gender, birthPlace, optional height
 *   and weight, releases, and avatar (undefined when no release matches).
 */
function scrapeProfile({ query }, actorName, actorAvatar, channel, releasesFromScene) {
  // a missing or malformed URL must not abort the profile, it just never matches
  const toPathname = (url) => {
    try {
      return new URL(url).pathname;
    } catch (error) {
      return null;
    }
  };

  const bioSelector = releasesFromScene ? 'ul li' : 'div.modelInfo li';

  const bio = Object.fromEntries(query.cnts(bioSelector).map((info) => {
    // split on the first colon only; entries without a colon get an undefined value
    const separatorIndex = info.indexOf(':');
    const key = separatorIndex === -1 ? info : info.slice(0, separatorIndex);
    const value = separatorIndex === -1 ? undefined : info.slice(separatorIndex + 1).trim();

    return [slugify(key, '_'), value];
  }));

  const profile = {};

  profile.name = actorName || bio.name;
  profile.gender = 'female';
  profile.birthPlace = bio.nationality;

  if (bio.height) profile.height = feetInchesToCm(bio.height);
  if (bio.weight) profile.weight = lbsToKg(bio.weight);

  // use the resolved name for the fallback scrape, since actorName may be empty
  profile.releases = releasesFromScene?.[profile.name]
    || scrapeProfileScenes(qu.initAll(query.all('.Models li')), profile.name, channel);

  // avatar is the poster of a scene, find scene and use its high quality poster instead
  const avatarPath = toPathname(actorAvatar);
  const avatarRelease = avatarPath
    ? profile.releases.find(release => toPathname(release.poster?.[1]) === avatarPath)
    : undefined;

  profile.avatar = avatarRelease?.poster[0];

  return profile;
}
/**
 * Fetch the biography page of a collection and scrape a profile for every
 * model listed on it.
 *
 * @param {string} entryId - Collection ID used in the biography URL.
 * @param {Object} _release - Unused.
 * @param {{url: string}} channel - Channel; its URL is the origin for image links.
 * @returns {Promise<?Array<Object>>} One profile per `.modelInfo > li` element,
 *   or null when the request did not succeed.
 */
async function fetchSceneActors(entryId, _release, channel) {
  const res = await qu.get(`https://inthecrack.com/Collection/Biography/${entryId}`);

  if (!res.ok) {
    return null;
  }

  const page = res.item.query;

  // every tab names a model and carries the ID of that model's scene list
  const tabs = qu.initAll(page.all('#ModelTabs li')).map(({ query }) => ({
    name: query.cnt('a'),
    id: query.q('a', 'data-model'),
  }));

  // scene lists keyed by model name, so each profile can pick up its own releases
  const releasesByName = Object.fromEntries(tabs.map(({ name, id }) => [
    name,
    scrapeProfileScenes(qu.initAll(page.all(`#Model-${id} li`)), name, channel),
  ]));

  return qu.initAll(page.all('.modelInfo > li')).map((item) => {
    const avatar = item.query.img('img', 'src', { origin: channel.url });

    return scrapeProfile(item, null, avatar, channel, releasesByName);
  });
}
async function scrapeScene({ query, html }, url, channel) {
const release = {};
const entryId = new URL(url).pathname.match(/\/Collection\/(\d+)/)[1];
release.shootId = query.cnt('h2 span').match(/^\d+/)?.[0];
release.entryId = release.shootId; // site entry ID can't be matched with upcoming scenes
const actors = await fetchSceneActors(entryId, release, channel);
release.actors = actors || query.cnt('h2 span')?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);
release.description = query.cnt('p#CollectionDescription'); release.description = query.cnt('p#CollectionDescription');
release.productionLocation = query.cnt('.modelCollectionHeader p')?.match(/Shoot Location: (.*)/)?.[1]; release.productionLocation = query.cnt('.modelCollectionHeader p')?.match(/Shoot Location: (.*)/)?.[1];
@ -67,22 +201,6 @@ function scrapeScene({ query, html }, url, channel) {
return release; return release;
} }
function scrapeProfile({ query, el }, actorName, entity, include) {
const profile = {};
profile.description = query.cnt('.bio-text');
profile.birthPlace = query.cnt('.birth-place span');
profile.avatar = query.img('.actor-photo img');
if (include.releases) {
return scrapeAll(qu.initAll(el, '.scene'));
}
console.log(profile);
return profile;
}
async function fetchLatest(channel, page = 1) { async function fetchLatest(channel, page = 1) {
const year = moment().subtract(page - 1, ' year').year(); const year = moment().subtract(page - 1, ' year').year();
@ -96,6 +214,16 @@ async function fetchLatest(channel, page = 1) {
return res.status; return res.status;
} }
/**
 * Fetch the channel front page and scrape its `#ComingSoon` list.
 *
 * @param {{url: string}} channel - Channel whose URL is requested.
 * @returns {Promise<Array<Object>|number>} Scraped upcoming releases, or the
 *   response status when the request did not succeed.
 */
async function fetchUpcoming(channel) {
  const response = await qu.getAll(channel.url, '#ComingSoon li');

  return response.ok
    ? scrapeUpcoming(response.items, channel)
    : response.status;
}
async function fetchScene(url, channel) { async function fetchScene(url, channel) {
const res = await qu.get(url); const res = await qu.get(url);
@ -106,12 +234,27 @@ async function fetchScene(url, channel) {
return res.status; return res.status;
} }
async function fetchProfile({ name: actorName }, entity, include) { async function fetchProfile({ name: actorName }, channel, _include) {
const url = `${entity.url}/actors/${slugify(actorName, '_')}`; const firstLetter = actorName.charAt(0).toUpperCase();
const res = await qu.get(url); const url = `${channel.url}/Collections/Name/${firstLetter}`;
const res = await qu.getAll(url, '.collectionGridLayout li');
if (res.ok) { if (res.ok) {
return scrapeProfile(res.item, actorName, entity, include); const actorItem = res.items.find(({ query }) => slugify(query.cnt('span')) === slugify(actorName));
if (actorItem) {
const actorUrl = actorItem.query.url('a', 'href', { origin: channel.url });
const actorAvatar = actorItem.query.img('img', 'src', { origin: channel.url });
const actorRes = await qu.get(actorUrl);
if (actorRes.ok) {
return scrapeProfile(actorRes.item, actorName, actorAvatar, channel);
}
return actorRes.status;
}
return null;
} }
return res.status; return res.status;
@ -119,6 +262,7 @@ async function fetchProfile({ name: actorName }, entity, include) {
module.exports = { module.exports = {
fetchLatest, fetchLatest,
fetchUpcoming,
fetchScene, fetchScene,
// fetchProfile, fetchProfile,
}; };

View File

@ -197,6 +197,7 @@ module.exports = {
iconmale, iconmale,
interracialpass: hush, interracialpass: hush,
interracialpovs: hush, interracialpovs: hush,
inthecrack,
jamesdeen: fullpornnetwork, jamesdeen: fullpornnetwork,
julesjordan, julesjordan,
kellymadison, kellymadison,

View File

@ -263,7 +263,7 @@ async function storeClips(releases) {
clip: clip.clip, clip: clip.clip,
})); }));
const storedClips = await bulkInsert('clips', curatedClipEntries); const storedClips = await bulkInsert('clips', curatedClipEntries, ['release_id', 'clip']);
const clipIdsByReleaseIdAndClip = storedClips.reduce((acc, clip) => ({ const clipIdsByReleaseIdAndClip = storedClips.reduce((acc, clip) => ({
...acc, ...acc,
[clip.release_id]: { [clip.release_id]: {