Added chapters and shoot location. Added In The Crack.
This commit is contained in:
@@ -608,7 +608,7 @@ async function scrapeActors(argNames) {
|
||||
|
||||
logger.info(`Scraping profiles for ${actorNames.length} actors`);
|
||||
|
||||
const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
|
||||
const sources = argv.actorsSources || config.profiles || Object.keys(scrapers.actors);
|
||||
const entitySlugs = sources.flat();
|
||||
|
||||
const [entities, existingActorEntries] = await Promise.all([
|
||||
|
||||
@@ -617,7 +617,7 @@ async function storeMedias(baseMedias) {
|
||||
return [...newMediaWithEntries, ...existingHashMedias];
|
||||
}
|
||||
|
||||
async function associateReleaseMedia(releases, type = 'releases') {
|
||||
async function associateReleaseMedia(releases, type = 'release') {
|
||||
if (!argv.media) {
|
||||
return;
|
||||
}
|
||||
@@ -664,7 +664,7 @@ async function associateReleaseMedia(releases, type = 'releases') {
|
||||
|
||||
if (media) {
|
||||
acc.push({
|
||||
release_id: releaseId,
|
||||
[`${type}_id`]: releaseId,
|
||||
media_id: media.use || media.entry.id,
|
||||
});
|
||||
}
|
||||
@@ -675,7 +675,7 @@ async function associateReleaseMedia(releases, type = 'releases') {
|
||||
.filter(Boolean);
|
||||
|
||||
if (associations.length > 0) {
|
||||
await bulkInsert(`${type}_${role}`, associations, false);
|
||||
await bulkInsert(`${type}s_${role}`, associations, false);
|
||||
}
|
||||
}, Promise.resolve());
|
||||
}
|
||||
|
||||
124
src/scrapers/inthecrack.js
Normal file
124
src/scrapers/inthecrack.js
Normal file
@@ -0,0 +1,124 @@
|
||||
'use strict';
|
||||
|
||||
const moment = require('moment');
|
||||
|
||||
const qu = require('../utils/q');
|
||||
const slugify = require('../utils/slugify');
|
||||
|
||||
/**
 * Scrape release summaries from a list of collection items.
 *
 * @param {Array<{query: object}>} scenes - one initialized query context per list item
 * @param {{url: string}} channel - channel whose URL is used as origin for relative links
 * @returns {object[]} one release per item with url, entryId, shootId, date, actors and poster
 */
function scrapeAll(scenes, channel) {
  return scenes.map(({ query }) => {
    const release = {};

    release.url = query.url('a', 'href', { origin: channel.url });
    release.entryId = new URL(release.url).pathname.match(/\/Collection\/(\d+)/)[1];

    // the first span is read once and reused; it may be missing, so every use is guarded
    const label = query.cnt('a span:nth-of-type(1)');

    // leading digits of the label are used as the shoot ID
    release.shootId = label?.match(/^\d+/)?.[0];
    release.date = query.date('a span:nth-of-type(2)', 'YYYY-MM-DD');

    // prefer the image alt text, fall back to the label for the actor names
    release.actors = (query.q('a img', 'alt') || label)?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);

    // with a shoot ID the poster URL can be constructed, otherwise use the listed thumbnail
    release.poster = release.shootId
      ? `https://inthecrack.com/assets/images/posters/collections/${release.shootId}.jpg`
      : query.img('a img', 'src', { origin: channel.url });

    return release;
  });
}
|
||||
|
||||
/**
 * Scrape a single collection page, including its clips as chapters.
 *
 * @param {{query: object, html: string}} context - query helpers and raw HTML of the page
 * @param {string} url - URL of the collection page, the entry ID is taken from its path
 * @param {{url: string}} channel - channel whose URL is used to prefix relative media paths
 * @returns {object} release with entryId, shootId, actors, description, productionLocation,
 *   poster and chapters
 */
function scrapeScene({ query, html }, url, channel) {
  const release = {};

  // the heading is read once and reused; it may be missing, so every use is guarded
  const heading = query.cnt('h2 span');

  release.entryId = new URL(url).pathname.match(/\/Collection\/(\d+)/)[1];
  release.shootId = heading?.match(/^\d+/)?.[0];

  release.actors = heading?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);

  release.description = query.cnt('p#CollectionDescription');
  release.productionLocation = query.cnt('.modelCollectionHeader p')?.match(/Shoot Location: (.*)/)?.[1];

  // the poster is taken from an inline background-image declaration in the raw HTML
  release.poster = qu.prefixUrl(html.match(/background-image: url\('(.*)'\)/)?.[1], channel.url);

  release.chapters = query.all('.ClipOuter').map((el) => {
    const chapter = {};

    chapter.title = query.text(el, 'h4');
    chapter.description = query.cnt(el, 'p');
    chapter.duration = query.dur(el, '.InlineDuration');

    // a clip without a background-image style simply gets no poster
    const posterStyle = query.style(el, '.clipImage', 'background-image');
    const poster = qu.prefixUrl(posterStyle?.match(/url\((.*)\)/)?.[1], channel.url);

    if (poster) {
      const { origin, pathname } = new URL(poster);

      chapter.poster = [
        `${origin}${pathname}`, // full size
        poster,
      ];
    }

    if (query.exists(el, '.ThreeDInfo')) {
      chapter.tags = ['3d'];
    }

    return chapter;
  });

  return release;
}
|
||||
|
||||
/**
 * Scrape an actor profile page.
 *
 * @param {{query: object, el: object}} context - query helpers and root element of the page
 * @param {string} actorName - name of the actor the profile was requested for
 * @param {{url: string}} entity - entity the profile belongs to, used as origin for release links
 * @param {{releases?: boolean}} [include] - when `releases` is set, the listed scenes are scraped too
 * @returns {object} profile with description, birthPlace, avatar and optionally releases
 */
function scrapeProfile({ query, el }, actorName, entity, include) {
  const profile = {};

  profile.description = query.cnt('.bio-text');
  profile.birthPlace = query.cnt('.birth-place span');

  profile.avatar = query.img('.actor-photo img');

  if (include?.releases) {
    // attach the releases to the profile rather than returning them instead of it;
    // scrapeAll needs the entity to resolve relative release URLs
    // NOTE(review): confirm the profile consumer reads `profile.releases`
    profile.releases = scrapeAll(qu.initAll(el, '.scene'), entity);
  }

  return profile;
}
|
||||
|
||||
/**
 * Fetch the collections listed for one year; every page maps to one year back in time.
 *
 * @param {{url: string}} channel - channel to fetch the overview from
 * @param {number} [page=1] - 1 is the current year, 2 the year before, and so on
 * @returns {Promise<object[]|number>} scraped releases, or the response status on failure
 */
async function fetchLatest(channel, page = 1) {
  // the unit must be exactly 'year'; an unrecognized unit makes the subtraction a no-op
  const year = moment().subtract(page - 1, 'year').year();

  const url = `${channel.url}/Collections/Date/${year}`;
  const res = await qu.getAll(url, '.collectionGridLayout li');

  if (res.ok) {
    return scrapeAll(res.items, channel);
  }

  return res.status;
}
|
||||
|
||||
/**
 * Fetch a collection page and scrape it into a release.
 *
 * @param {string} url - URL of the collection page
 * @param {object} channel - channel the page belongs to, passed on to the scraper
 * @returns {Promise<object|number>} scraped release, or the response status on failure
 */
async function fetchScene(url, channel) {
  const response = await qu.get(url);

  if (!response.ok) {
    return response.status;
  }

  return scrapeScene(response.item, url, channel);
}
|
||||
|
||||
/**
 * Fetch an actor's profile page and scrape it.
 *
 * @param {{name: string}} actor - actor to look up; the name is slugified with underscores
 * @param {{url: string}} entity - entity whose URL hosts the actor pages
 * @param {object} include - options passed on to the profile scraper
 * @returns {Promise<object|number>} scraped profile, or the response status on failure
 */
async function fetchProfile({ name: actorName }, entity, include) {
  const actorSlug = slugify(actorName, '_');
  const response = await qu.get(`${entity.url}/actors/${actorSlug}`);

  return response.ok
    ? scrapeProfile(response.item, actorName, entity, include)
    : response.status;
}
|
||||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchScene,
|
||||
// fetchProfile,
|
||||
};
|
||||
@@ -27,6 +27,7 @@ const hitzefrei = require('./hitzefrei');
|
||||
const hush = require('./hush');
|
||||
const iconmale = require('./iconmale');
|
||||
const insex = require('./insex');
|
||||
const inthecrack = require('./inthecrack');
|
||||
const jayrock = require('./jayrock');
|
||||
const jesseloadsmonsterfacials = require('./jesseloadsmonsterfacials');
|
||||
const julesjordan = require('./julesjordan');
|
||||
@@ -108,6 +109,7 @@ module.exports = {
|
||||
hushpass: hush,
|
||||
insex,
|
||||
interracialpass: hush,
|
||||
inthecrack,
|
||||
jayrock,
|
||||
jesseloadsmonsterfacials,
|
||||
julesjordan,
|
||||
|
||||
@@ -7,13 +7,14 @@ const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const slugify = require('./utils/slugify');
|
||||
const bulkInsert = require('./utils/bulk-insert');
|
||||
const resolvePlace = require('./utils/resolve-place');
|
||||
const { formatDate } = require('./utils/qu');
|
||||
const { associateActors, scrapeActors } = require('./actors');
|
||||
const { associateReleaseTags } = require('./tags');
|
||||
const { curateEntity } = require('./entities');
|
||||
const { associateReleaseMedia } = require('./media');
|
||||
|
||||
function curateReleaseEntry(release, batchId, existingRelease, type = 'scene') {
|
||||
async function curateReleaseEntry(release, batchId, existingRelease, type = 'scene') {
|
||||
const slugBase = release.title
|
||||
|| (release.actors?.length && `${release.entity.slug} ${release.actors.map(actor => actor.name).join(' ')}`)
|
||||
|| (release.date && `${release.entity.slug} ${formatDate(release.date, 'YYYY MM DD')}`)
|
||||
@@ -50,6 +51,20 @@ function curateReleaseEntry(release, batchId, existingRelease, type = 'scene') {
|
||||
curatedRelease.duration = release.duration;
|
||||
}
|
||||
|
||||
if (release.productionLocation) {
|
||||
curatedRelease.production_location = release.productionLocation;
|
||||
|
||||
if (argv.resolvePlace) {
|
||||
const productionLocation = await resolvePlace(release.productionLocation);
|
||||
|
||||
if (productionLocation) {
|
||||
curatedRelease.production_city = productionLocation.city;
|
||||
curatedRelease.production_state = productionLocation.state;
|
||||
curatedRelease.production_country_alpha2 = productionLocation.country;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!existingRelease && !release.id) {
|
||||
curatedRelease.created_batch_id = batchId;
|
||||
}
|
||||
@@ -228,6 +243,46 @@ async function updateReleasesSearch(releaseIds) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Store the chapters of the given releases and associate their tags and media.
 *
 * @param {object[]} releases - stored releases (with `id`), each optionally carrying `chapters`
 * @returns {Promise<void>}
 */
async function storeChapters(releases) {
  // chapters are numbered from 1 in the order they were scraped
  const chapters = releases.flatMap(release => (release.chapters || []).map((chapter, index) => ({
    title: chapter.title,
    description: chapter.description,
    releaseId: release.id,
    chapter: index + 1,
    duration: chapter.duration,
    poster: chapter.poster,
    photos: chapter.photos,
    tags: chapter.tags,
  })));

  if (chapters.length === 0) {
    // nothing to insert or associate
    return;
  }

  const curatedChapterEntries = chapters.map(chapter => ({
    title: chapter.title,
    description: chapter.description,
    duration: chapter.duration,
    release_id: chapter.releaseId,
    chapter: chapter.chapter,
  }));

  const storedChapters = await bulkInsert('chapters', curatedChapterEntries);

  // string keys, so IDs match regardless of whether they come back as numbers or strings
  const chapterIdsByReleaseIdAndChapter = new Map(storedChapters.map(chapter => [
    `${chapter.release_id}:${chapter.chapter}`,
    chapter.id,
  ]));

  const chaptersWithId = chapters
    .map(chapter => ({
      ...chapter,
      id: chapterIdsByReleaseIdAndChapter.get(`${chapter.releaseId}:${chapter.chapter}`),
    }))
    // a chapter missing from the insert result has no ID to associate tags or media with
    .filter(chapter => chapter.id != null);

  await associateReleaseTags(chaptersWithId, 'chapter');

  // media is more error-prone, associate separately
  await associateReleaseMedia(chaptersWithId, 'chapter');
}
|
||||
|
||||
async function storeScenes(releases) {
|
||||
if (releases.length === 0) {
|
||||
return [];
|
||||
@@ -241,7 +296,7 @@ async function storeScenes(releases) {
|
||||
// uniqueness is entity ID + entry ID, filter uniques after adding entities
|
||||
const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);
|
||||
|
||||
const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
|
||||
const curatedNewReleaseEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId)));
|
||||
|
||||
const storedReleases = await bulkInsert('releases', curatedNewReleaseEntries);
|
||||
// TODO: update duplicate releases
|
||||
@@ -263,6 +318,8 @@ async function storeScenes(releases) {
|
||||
await scrapeActors(actors.map(actor => actor.name));
|
||||
}
|
||||
|
||||
await storeChapters(releasesWithId);
|
||||
|
||||
logger.info(`Stored ${storedReleaseEntries.length} releases`);
|
||||
|
||||
return releasesWithId;
|
||||
@@ -303,13 +360,13 @@ async function storeMovies(movies, movieScenes) {
|
||||
const { uniqueReleases } = await filterDuplicateReleases(movies);
|
||||
const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
|
||||
|
||||
const curatedMovieEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId, null, 'movie'));
|
||||
const curatedMovieEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId, null, 'movie')));
|
||||
|
||||
const storedMovies = await bulkInsert('movies', curatedMovieEntries, ['entity_id', 'entry_id'], true);
|
||||
const moviesWithId = attachReleaseIds(movies, storedMovies);
|
||||
|
||||
await associateMovieScenes(moviesWithId, movieScenes);
|
||||
await associateReleaseMedia(moviesWithId, 'movies');
|
||||
await associateReleaseMedia(moviesWithId, 'movie');
|
||||
|
||||
return storedMovies;
|
||||
}
|
||||
|
||||
36
src/tags.js
36
src/tags.js
@@ -2,6 +2,7 @@
|
||||
|
||||
const knex = require('./knex');
|
||||
const slugify = require('./utils/slugify');
|
||||
const bulkInsert = require('./utils/bulk-insert');
|
||||
|
||||
async function matchReleaseTags(releases) {
|
||||
const rawTags = releases
|
||||
@@ -28,7 +29,7 @@ async function matchReleaseTags(releases) {
|
||||
}
|
||||
|
||||
async function getEntityTags(releases) {
|
||||
const entityIds = releases.map(release => release.entity.id);
|
||||
const entityIds = releases.map(release => release.entity?.id).filter(Boolean);
|
||||
const entityTags = await knex('entities_tags').whereIn('entity_id', entityIds);
|
||||
|
||||
const entityTagIdsByEntityId = entityTags.reduce((acc, entityTag) => {
|
||||
@@ -44,10 +45,10 @@ async function getEntityTags(releases) {
|
||||
return entityTagIdsByEntityId;
|
||||
}
|
||||
|
||||
function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId) {
|
||||
function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId, type) {
|
||||
const tagAssociations = releases
|
||||
.map((release) => {
|
||||
const entityTagIds = entityTagIdsByEntityId[release.entity.id];
|
||||
const entityTagIds = entityTagIdsByEntityId[release.entity?.id] || [];
|
||||
const releaseTags = release.tags || [];
|
||||
|
||||
const releaseTagIds = releaseTags.every(tag => typeof tag === 'number')
|
||||
@@ -61,7 +62,7 @@ function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntit
|
||||
.filter(Boolean),
|
||||
)]
|
||||
.map(tagId => ({
|
||||
release_id: release.id,
|
||||
[`${type}_id`]: release.id,
|
||||
tag_id: tagId,
|
||||
}));
|
||||
|
||||
@@ -72,34 +73,13 @@ function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntit
|
||||
return tagAssociations;
|
||||
}
|
||||
|
||||
/**
 * Drop the release-tag associations that are already present in `releases_tags`.
 *
 * @param {Array<{release_id: *, tag_id: *}>} tagAssociations - candidate associations
 * @returns {Promise<object[]>} the candidates for which no matching row was found
 */
async function filterUniqueAssociations(tagAssociations) {
  const candidatePairs = tagAssociations.map(({ release_id: releaseId, tag_id: tagId }) => [releaseId, tagId]);

  const existingAssociations = await knex('releases_tags')
    .whereIn(['release_id', 'tag_id'], candidatePairs);

  // release ID -> tag ID -> true, for every association that already exists
  const existingTagIdsByReleaseId = {};

  existingAssociations.forEach(({ release_id: releaseId, tag_id: tagId }) => {
    if (!existingTagIdsByReleaseId[releaseId]) {
      existingTagIdsByReleaseId[releaseId] = {};
    }

    existingTagIdsByReleaseId[releaseId][tagId] = true;
  });

  return tagAssociations.filter(({ release_id: releaseId, tag_id: tagId }) => {
    const isStored = existingTagIdsByReleaseId[releaseId]?.[tagId];

    return !isStored;
  });
}
|
||||
|
||||
async function associateReleaseTags(releases) {
|
||||
async function associateReleaseTags(releases, type = 'release') {
|
||||
const tagIdsBySlug = await matchReleaseTags(releases);
|
||||
const EntityTagIdsByEntityId = await getEntityTags(releases);
|
||||
|
||||
const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, EntityTagIdsByEntityId);
|
||||
const uniqueAssociations = await filterUniqueAssociations(tagAssociations);
|
||||
const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, EntityTagIdsByEntityId, type);
|
||||
|
||||
await knex('releases_tags').insert(uniqueAssociations);
|
||||
await bulkInsert(`${type}s_tags`, tagAssociations, false);
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
|
||||
@@ -4,6 +4,10 @@ const knex = require('../knex');
|
||||
const chunk = require('./chunk');
|
||||
|
||||
async function bulkUpsert(table, items, conflict, update = true, chunkSize) {
|
||||
if (items.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const updated = (conflict === false && ':query ON CONFLICT DO NOTHING RETURNING *;')
|
||||
|| (conflict && update && `
|
||||
:query ON CONFLICT (${conflict})
|
||||
|
||||
@@ -20,7 +20,15 @@ async function resolvePlace(query) {
|
||||
const rawPlace = item.address;
|
||||
const place = {};
|
||||
|
||||
if (rawPlace.city) place.city = rawPlace.city;
|
||||
if (item.class === 'place' || item.class === 'boundary') {
|
||||
const location = rawPlace[item.type] || rawPlace.city || rawPlace.place;
|
||||
|
||||
if (location) {
|
||||
place.place = location;
|
||||
place.city = rawPlace.city || location;
|
||||
}
|
||||
}
|
||||
|
||||
if (rawPlace.state) place.state = rawPlace.state;
|
||||
if (rawPlace.country_code) place.country = rawPlace.country_code.toUpperCase();
|
||||
if (rawPlace.continent) place.continent = rawPlace.continent;
|
||||
|
||||
Reference in New Issue
Block a user