Splitting Han titles and actors in Model Media scraper.

This commit is contained in:
DebaucheryLibrarian 2023-08-02 02:14:41 +02:00
parent 5783507344
commit 8c1f1b69ff
22 changed files with 80 additions and 10 deletions

View File

@ -35,7 +35,10 @@
/>
<div class="info column">
<div class="row row-title">
<div
class="row row-title"
:class="{ 'has-alt': release.altTitles?.length > 0 }"
>
<h2
v-if="release.title"
class="title"
@ -62,6 +65,19 @@
/>
</div>
<div
v-if="release.altTitles?.length > 0"
class="row alttitles"
>
<h2
v-for="(altTitle, index) in release.altTitles"
:key="`altitle-${index}`"
class="alttitle"
>
{{ altTitle }}
</h2>
</div>
<Releases
v-if="release.scenes && release.scenes.length > 0"
:releases="release.scenes"
@ -520,6 +536,11 @@ export default {
color: var(--shadow);
}
.alttitle {
color: var(--shadow);
font-size: 1rem;
}
.album-toggle {
height: fit-content;
display: inline-flex;

View File

@ -545,6 +545,7 @@ const releaseFragment = `
release(id: $releaseId) {
id
title
altTitles
description
date
datePrecision

View File

@ -30,6 +30,9 @@ module.exports = {
},
},
},
location: {
userAgent: 'contact via https://traxxx.me/',
},
analytics: {
enabled: false,
address: 'http://localhost:3000/script.js',

View File

@ -0,0 +1,11 @@
exports.up = async (knex) => {
await knex.schema.alterTable('releases', (table) => {
table.specificType('alt_titles', 'text ARRAY');
});
};
exports.down = async (knex) => {
await knex.schema.alterTable('releases', (table) => {
table.dropColumn('alt_titles');
});
};

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.0 KiB

After

Width:  |  Height:  |  Size: 4.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.0 KiB

After

Width:  |  Height:  |  Size: 4.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

After

Width:  |  Height:  |  Size: 12 KiB

View File

@ -14,10 +14,38 @@ function scrapeAll(scenes) {
const { origin, pathname, searchParams } = new URL(url);
release.url = `${origin}${pathname}`;
release.actors = searchParams.get('models_name')?.split(',');
release.shootId = pathname.match(/((LA)|(LT)|(MA)|(MD)|(MM)|(MS)|(MT)|(RR))[\w-]+/)?.[0]; // pathname sometimes contains other text, match at least two letters to prevent false positives
release.actors = searchParams.get('models_name')?.split(',').map((actor) => {
const [han, english] = actor.split('/').map((name) => name.trim());
if (/amateur/i.test(english)) {
// not a name
return null;
}
return {
name: english || han,
alias: english && han,
};
}).filter(Boolean);
}
const rawTitle = query.content('.video-title div')?.replace(release.shootId, '');
if (rawTitle) {
// find / closest to Han in case there are multiple, account for no / at all
const hanIndex = rawTitle.match(/\p{Script_Extensions=Han}/u)?.index;
const splitIndex = rawTitle.slice(0, hanIndex).lastIndexOf('/') || hanIndex;
if (hanIndex && splitIndex > -1) {
release.title = rawTitle.slice(0, splitIndex).trim();
release.altTitles = [rawTitle.slice(splitIndex + 1).trim()];
} else {
release.title = rawTitle;
}
}
release.title = query.content('.video-title div');
release.duration = query.duration('.timestamp');
const poster = query.img('img', { attribute: 'data-src' });
@ -31,8 +59,6 @@ function scrapeAll(scenes) {
release.teaser = query.video(null, { attribute: 'data-video-src' });
console.log(release);
return release;
});
}
@ -49,17 +75,16 @@ function scrapeProfile({ query }) {
}
profile.description = query.content('h2') || null;
profile.height = query.number('//span[text()="Measurements"]/following-sibling::span', { match: /(\d+) cm/, matchIndex: 1 });
profile.height = query.number('//span[text()="Height"]/following-sibling::span', { match: /(\d+) cm/, matchIndex: 1 });
profile.weight = query.number('//span[text()="Weight"]/following-sibling::span', { match: /(\d+) kg/, matchIndex: 1 });
profile.measurements = query.number('//span[text()="Birth Place"]/following-sibling::span', { match: /(\d+) cm/, matchIndex: 1 });
// can't find a single profile with this information available, but add for good measure
profile.measurements = query.content('//span[text()="Measurements"]/following-sibling::span');
profile.birthPlace = query.number('//span[text()="Birth Place"]/following-sibling::span');
profile.banner = query.img('div[class*="banner"] > img');
profile.photos = query.imgs('#MusModelSwiper img');
console.log(profile);
return profile;
}

View File

@ -82,6 +82,7 @@ const scrapers = {
americanpornstar,
amateureuro: porndoe,
archangel,
asiam: modelmedia,
assylum,
aziani,
badoink,
@ -115,6 +116,7 @@ const scrapers = {
interracialpass: hush,
inthecrack,
jayrock,
jerkaoke: modelmedia,
jesseloadsmonsterfacials,
julesjordan,
karups,
@ -178,6 +180,7 @@ const scrapers = {
analviolation: fullpornnetwork,
anilos: nubiles,
archangel,
asiam: modelmedia,
aziani,
babes: mindgeek,
babevr: badoink,
@ -234,6 +237,7 @@ const scrapers = {
interracialpovs: hush,
inthecrack,
jamesdeen: fullpornnetwork,
jerkaoke: modelmedia,
julesjordan,
karups,
kellymadison,

View File

@ -32,6 +32,7 @@ async function curateReleaseEntry(release, batchId, existingRelease, type = 'sce
const curatedRelease = {
title: decode(release.title),
alt_titles: release.altTitles?.map((title) => decode(title)),
entry_id: release.entryId || null,
entity_id: release.entity.id,
studio_id: release.studio?.id || null,
@ -46,6 +47,8 @@ async function curateReleaseEntry(release, batchId, existingRelease, type = 'sce
updated_batch_id: batchId,
};
console.log(curatedRelease);
if (release.id) {
// release is updated
curatedRelease.id = release.id;

View File

@ -1,5 +1,7 @@
'use strict';
const config = require('config');
const knex = require('../knex');
const logger = require('../logger')(__filename);
const http = require('./http');
@ -27,7 +29,7 @@ async function resolvePlace(query) {
// https://operations.osmfoundation.org/policies/nominatim/
const res = await http.get(`https://nominatim.openstreetmap.org/search/${encodeURI(query)}?format=json&accept-language=en&addressdetails=1`, {
headers: {
'User-Agent': 'contact at moonloop.adult@protonmail.com',
'User-Agent': config.location.userAgent,
},
interval: 1000,
concurrency: 1,