Added chapters and shoot location. Added In The Crack.

DebaucheryLibrarian
2020-08-20 04:57:38 +02:00
parent fd4477bc50
commit 2835c66694
27 changed files with 471 additions and 52 deletions
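Among the changes below, the new `storeChapters` writes chapter rows (title, description, duration, `release_id`, a 1-based `chapter` index) to a `chapters` table and then associates their tags and media through `chapters_tags` and `chapters_<role>` tables. The migration for that table is presumably among the 27 changed files but is not shown in this excerpt; as an illustration only (not part of the commit), a minimal knex migration matching the columns `storeChapters` inserts could look like this, with column types and constraints assumed:

// Hypothetical migration sketched from the columns storeChapters inserts; not the actual one.
exports.up = async (knex) => {
  await knex.schema.createTable('chapters', (table) => {
    table.increments('id');

    table.integer('release_id')
      .notNullable()
      .references('id')
      .inTable('releases')
      .onDelete('cascade');

    table.integer('chapter').notNullable(); // 1-based index within the release
    table.string('title');
    table.text('description');
    table.integer('duration'); // seconds

    table.unique(['release_id', 'chapter']);
  });
};

exports.down = async (knex) => knex.schema.dropTable('chapters');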

View File

@@ -608,7 +608,7 @@ async function scrapeActors(argNames) {
logger.info(`Scraping profiles for ${actorNames.length} actors`);
const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
const sources = argv.actorsSources || config.profiles || Object.keys(scrapers.actors);
const entitySlugs = sources.flat();
const [entities, existingActorEntries] = await Promise.all([

View File

@@ -617,7 +617,7 @@ async function storeMedias(baseMedias) {
return [...newMediaWithEntries, ...existingHashMedias];
}
async function associateReleaseMedia(releases, type = 'releases') {
async function associateReleaseMedia(releases, type = 'release') {
if (!argv.media) {
return;
}
@@ -664,7 +664,7 @@ async function associateReleaseMedia(releases, type = 'releases') {
if (media) {
acc.push({
release_id: releaseId,
[`${type}_id`]: releaseId,
media_id: media.use || media.entry.id,
});
}
@@ -675,7 +675,7 @@ async function associateReleaseMedia(releases, type = 'releases') {
.filter(Boolean);
if (associations.length > 0) {
await bulkInsert(`${type}_${role}`, associations, false);
await bulkInsert(`${type}s_${role}`, associations, false);
}
}, Promise.resolve());
}
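With `associateReleaseMedia` now taking a singular `type` (defaulting to `'release'`), the association column and join table are derived as `${type}_id` and `${type}s_${role}`. A tiny illustration, not part of the commit; the role values are assumed examples following the existing naming pattern:

// Sketch of the derived names for each release-like type; role values are assumed.
const associationNames = (type, role) => ({
  column: `${type}_id`,      // release_id, movie_id, chapter_id
  table: `${type}s_${role}`, // e.g. releases_posters, chapters_photos
});

console.log(associationNames('release', 'posters')); // { column: 'release_id', table: 'releases_posters' }
console.log(associationNames('chapter', 'photos'));  // { column: 'chapter_id', table: 'chapters_photos' }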

src/scrapers/inthecrack.js (new file, 124 lines)
View File

@@ -0,0 +1,124 @@
'use strict';
const moment = require('moment');
const qu = require('../utils/q');
const slugify = require('../utils/slugify');
function scrapeAll(scenes, channel) {
return scenes.map(({ query }) => {
const release = {};
release.url = query.url('a', 'href', { origin: channel.url });
release.entryId = new URL(release.url).pathname.match(/\/Collection\/(\d+)/)[1];
release.shootId = query.cnt('a span:nth-of-type(1)').match(/^\d+/)?.[0];
release.date = query.date('a span:nth-of-type(2)', 'YYYY-MM-DD');
release.actors = (query.q('a img', 'alt') || query.cnt('a span:nth-of-type(1)'))?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);
release.poster = release.shootId
? `https://inthecrack.com/assets/images/posters/collections/${release.shootId}.jpg`
: query.img('a img', 'src', { origin: channel.url });
return release;
});
}
function scrapeScene({ query, html }, url, channel) {
const release = {};
release.entryId = new URL(url).pathname.match(/\/Collection\/(\d+)/)[1];
release.shootId = query.cnt('h2 span').match(/^\d+/)?.[0];
release.actors = query.cnt('h2 span')?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);
release.description = query.cnt('p#CollectionDescription');
release.productionLocation = query.cnt('.modelCollectionHeader p')?.match(/Shoot Location: (.*)/)?.[1];
release.poster = qu.prefixUrl(html.match(/background-image: url\('(.*)'\)/)?.[1], channel.url);
release.chapters = query.all('.ClipOuter').map((el) => {
const chapter = {};
chapter.title = query.text(el, 'h4');
chapter.description = query.cnt(el, 'p');
chapter.duration = query.dur(el, '.InlineDuration');
const posterStyle = query.style(el, '.clipImage', 'background-image');
const poster = qu.prefixUrl(posterStyle.match(/url\((.*)\)/)?.[1], channel.url);
if (poster) {
const { origin, pathname } = new URL(poster);
chapter.poster = [
`${origin}${pathname}`, // full size
poster,
];
}
if (query.exists(el, '.ThreeDInfo')) {
chapter.tags = ['3d'];
}
return chapter;
});
return release;
}
function scrapeProfile({ query, el }, actorName, entity, include) {
const profile = {};
profile.description = query.cnt('.bio-text');
profile.birthPlace = query.cnt('.birth-place span');
profile.avatar = query.img('.actor-photo img');
if (include.releases) {
return scrapeAll(qu.initAll(el, '.scene'));
}
console.log(profile);
return profile;
}
async function fetchLatest(channel, page = 1) {
const year = moment().subtract(page - 1, 'year').year();
const url = `${channel.url}/Collections/Date/${year}`;
const res = await qu.getAll(url, '.collectionGridLayout li');
if (res.ok) {
return scrapeAll(res.items, channel);
}
return res.status;
}
async function fetchScene(url, channel) {
const res = await qu.get(url);
if (res.ok) {
return scrapeScene(res.item, url, channel);
}
return res.status;
}
async function fetchProfile({ name: actorName }, entity, include) {
const url = `${entity.url}/actors/${slugify(actorName, '_')}`;
const res = await qu.get(url);
if (res.ok) {
return scrapeProfile(res.item, actorName, entity, include);
}
return res.status;
}
module.exports = {
fetchLatest,
fetchScene,
// fetchProfile,
};
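For reference, a rough standalone invocation of the new scraper's exports (not part of the commit). In the codebase the channel object is supplied by the scraper runner; only its `url` property is read above, so the shape here is inferred:

// Hypothetical manual run; normally the framework provides the channel/entity objects.
const inthecrack = require('./inthecrack');

(async () => {
  const channel = { url: 'https://inthecrack.com' };

  // page 1 maps to the current year's collection listing, page 2 to the year before, etc.
  const latest = await inthecrack.fetchLatest(channel, 1);

  if (Array.isArray(latest)) {
    console.log(latest.length, latest[0]?.entryId, latest[0]?.shootId);
  } else {
    console.log('listing request failed with status', latest); // fetchLatest returns res.status on failure
  }
})();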

View File

@@ -27,6 +27,7 @@ const hitzefrei = require('./hitzefrei');
const hush = require('./hush');
const iconmale = require('./iconmale');
const insex = require('./insex');
const inthecrack = require('./inthecrack');
const jayrock = require('./jayrock');
const jesseloadsmonsterfacials = require('./jesseloadsmonsterfacials');
const julesjordan = require('./julesjordan');
@@ -108,6 +109,7 @@ module.exports = {
hushpass: hush,
insex,
interracialpass: hush,
inthecrack,
jayrock,
jesseloadsmonsterfacials,
julesjordan,

View File

@@ -7,13 +7,14 @@ const logger = require('./logger')(__filename);
const knex = require('./knex');
const slugify = require('./utils/slugify');
const bulkInsert = require('./utils/bulk-insert');
const resolvePlace = require('./utils/resolve-place');
const { formatDate } = require('./utils/qu');
const { associateActors, scrapeActors } = require('./actors');
const { associateReleaseTags } = require('./tags');
const { curateEntity } = require('./entities');
const { associateReleaseMedia } = require('./media');
function curateReleaseEntry(release, batchId, existingRelease, type = 'scene') {
async function curateReleaseEntry(release, batchId, existingRelease, type = 'scene') {
const slugBase = release.title
|| (release.actors?.length && `${release.entity.slug} ${release.actors.map(actor => actor.name).join(' ')}`)
|| (release.date && `${release.entity.slug} ${formatDate(release.date, 'YYYY MM DD')}`)
@@ -50,6 +51,20 @@ function curateReleaseEntry(release, batchId, existingRelease, type = 'scene') {
curatedRelease.duration = release.duration;
}
if (release.productionLocation) {
curatedRelease.production_location = release.productionLocation;
if (argv.resolvePlace) {
const productionLocation = await resolvePlace(release.productionLocation);
if (productionLocation) {
curatedRelease.production_city = productionLocation.city;
curatedRelease.production_state = productionLocation.state;
curatedRelease.production_country_alpha2 = productionLocation.country;
}
}
}
if (!existingRelease && !release.id) {
curatedRelease.created_batch_id = batchId;
}
@@ -228,6 +243,46 @@ async function updateReleasesSearch(releaseIds) {
}
}
async function storeChapters(releases) {
const chapters = releases.map(release => release.chapters?.map((chapter, index) => ({
title: chapter.title,
description: chapter.description,
releaseId: release.id,
chapter: index + 1,
duration: chapter.duration,
poster: chapter.poster,
photos: chapter.photos,
tags: chapter.tags,
}))).flat().filter(Boolean);
const curatedChapterEntries = chapters.map(chapter => ({
title: chapter.title,
description: chapter.description,
duration: chapter.duration,
release_id: chapter.releaseId,
chapter: chapter.chapter,
}));
const storedChapters = await bulkInsert('chapters', curatedChapterEntries);
const chapterIdsByReleaseIdAndChapter = storedChapters.reduce((acc, chapter) => ({
...acc,
[chapter.release_id]: {
...acc[chapter.release_id],
[chapter.chapter]: chapter.id,
},
}), {});
const chaptersWithId = chapters.map(chapter => ({
...chapter,
id: chapterIdsByReleaseIdAndChapter[chapter.releaseId][chapter.chapter],
}));
await associateReleaseTags(chaptersWithId, 'chapter');
// media is more error-prone, associate separately
await associateReleaseMedia(chaptersWithId, 'chapter');
}
async function storeScenes(releases) {
if (releases.length === 0) {
return [];
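To show what the new `storeChapters` above consumes, here is a hypothetical release as a scraper like the In The Crack one would produce it, after `storeScenes` has attached its database id (illustration only, not part of the commit):

// Hypothetical input for storeChapters([release]); only fields it reads are shown.
const release = {
  id: 1234, // assigned when the release itself was bulk-inserted
  chapters: [
    {
      title: 'Clip one',
      description: 'Opening clip',
      duration: 312, // seconds
      poster: ['https://example.com/poster.jpg', 'https://example.com/poster_small.jpg'],
      tags: ['3d'],
    },
  ],
};

// storeChapters numbers these 1..n per release, bulk-inserts them into chapters, then uses
// the (release_id, chapter) pair to look the generated chapter ids back up before associating
// tags and media with type 'chapter'.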
@@ -241,7 +296,7 @@ async function storeScenes(releases) {
// uniqueness is entity ID + entry ID, filter uniques after adding entities
const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);
const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
const curatedNewReleaseEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId)));
const storedReleases = await bulkInsert('releases', curatedNewReleaseEntries);
// TODO: update duplicate releases
@@ -263,6 +318,8 @@ async function storeScenes(releases) {
await scrapeActors(actors.map(actor => actor.name));
}
await storeChapters(releasesWithId);
logger.info(`Stored ${storedReleaseEntries.length} releases`);
return releasesWithId;
@@ -303,13 +360,13 @@ async function storeMovies(movies, movieScenes) {
const { uniqueReleases } = await filterDuplicateReleases(movies);
const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
const curatedMovieEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId, null, 'movie'));
const curatedMovieEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId, null, 'movie')));
const storedMovies = await bulkInsert('movies', curatedMovieEntries, ['entity_id', 'entry_id'], true);
const moviesWithId = attachReleaseIds(movies, storedMovies);
await associateMovieScenes(moviesWithId, movieScenes);
await associateReleaseMedia(moviesWithId, 'movies');
await associateReleaseMedia(moviesWithId, 'movie');
return storedMovies;
}

View File

@@ -2,6 +2,7 @@
const knex = require('./knex');
const slugify = require('./utils/slugify');
const bulkInsert = require('./utils/bulk-insert');
async function matchReleaseTags(releases) {
const rawTags = releases
@@ -28,7 +29,7 @@ async function matchReleaseTags(releases) {
}
async function getEntityTags(releases) {
const entityIds = releases.map(release => release.entity.id);
const entityIds = releases.map(release => release.entity?.id).filter(Boolean);
const entityTags = await knex('entities_tags').whereIn('entity_id', entityIds);
const entityTagIdsByEntityId = entityTags.reduce((acc, entityTag) => {
@@ -44,10 +45,10 @@ async function getEntityTags(releases) {
return entityTagIdsByEntityId;
}
function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId) {
function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId, type) {
const tagAssociations = releases
.map((release) => {
const entityTagIds = entityTagIdsByEntityId[release.entity.id];
const entityTagIds = entityTagIdsByEntityId[release.entity?.id] || [];
const releaseTags = release.tags || [];
const releaseTagIds = releaseTags.every(tag => typeof tag === 'number')
@@ -61,7 +62,7 @@ function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntit
.filter(Boolean),
)]
.map(tagId => ({
release_id: release.id,
[`${type}_id`]: release.id,
tag_id: tagId,
}));
@@ -72,34 +73,13 @@ function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntit
return tagAssociations;
}
async function filterUniqueAssociations(tagAssociations) {
const duplicateAssociations = await knex('releases_tags')
.whereIn(['release_id', 'tag_id'], tagAssociations.map(association => [association.release_id, association.tag_id]));
const duplicateAssociationsByReleaseIdAndTagId = duplicateAssociations.reduce((acc, association) => {
if (!acc[association.release_id]) {
acc[association.release_id] = {};
}
acc[association.release_id][association.tag_id] = true;
return acc;
}, {});
const uniqueAssociations = tagAssociations
.filter(association => !duplicateAssociationsByReleaseIdAndTagId[association.release_id]?.[association.tag_id]);
return uniqueAssociations;
}
async function associateReleaseTags(releases) {
async function associateReleaseTags(releases, type = 'release') {
const tagIdsBySlug = await matchReleaseTags(releases);
const EntityTagIdsByEntityId = await getEntityTags(releases);
const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, EntityTagIdsByEntityId);
const uniqueAssociations = await filterUniqueAssociations(tagAssociations);
const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, EntityTagIdsByEntityId, type);
await knex('releases_tags').insert(uniqueAssociations);
await bulkInsert(`${type}s_tags`, tagAssociations, false);
}
module.exports = {

View File

@@ -4,6 +4,10 @@ const knex = require('../knex');
const chunk = require('./chunk');
async function bulkUpsert(table, items, conflict, update = true, chunkSize) {
if (items.length === 0) {
return [];
}
const updated = (conflict === false && ':query ON CONFLICT DO NOTHING RETURNING *;')
|| (conflict && update && `
:query ON CONFLICT (${conflict})

View File

@@ -20,7 +20,15 @@ async function resolvePlace(query) {
const rawPlace = item.address;
const place = {};
if (rawPlace.city) place.city = rawPlace.city;
if (item.class === 'place' || item.class === 'boundary') {
const location = rawPlace[item.type] || rawPlace.city || rawPlace.place;
if (location) {
place.place = location;
place.city = rawPlace.city || location;
}
}
if (rawPlace.state) place.state = rawPlace.state;
if (rawPlace.country_code) place.country = rawPlace.country_code.toUpperCase();
if (rawPlace.continent) place.continent = rawPlace.continent;
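The added `class`/`type` branch is consistent with a Nominatim-style geocoding result, where the address key named after the result type carries the most specific place name. A trimmed, hypothetical example of the input it handles (illustration only, not part of the commit):

// Hypothetical geocoder item for a city-level query; only the fields read above are shown.
const item = {
  class: 'boundary',
  type: 'administrative',
  address: {
    city: 'Prague',
    state: 'Praha',
    country_code: 'cz',
  },
};

// address.administrative is absent, so the new branch falls back to address.city:
// place.place = 'Prague', place.city = 'Prague', place.state = 'Praha', place.country = 'CZ'.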