Splitting Han titles and actors in Model Media scraper.

This commit is contained in:
DebaucheryLibrarian 2023-08-02 02:14:41 +02:00
parent 5783507344
commit 8c1f1b69ff
22 changed files with 80 additions and 10 deletions

View File

@ -35,7 +35,10 @@
/> />
<div class="info column"> <div class="info column">
<div class="row row-title"> <div
class="row row-title"
:class="{ 'has-alt': release.altTitles?.length > 0 }"
>
<h2 <h2
v-if="release.title" v-if="release.title"
class="title" class="title"
@ -62,6 +65,19 @@
/> />
</div> </div>
<div
v-if="release.altTitles?.length > 0"
class="row alttitles"
>
<h2
v-for="(altTitle, index) in release.altTitles"
:key="`altitle-${index}`"
class="alttitle"
>
{{ altTitle }}
</h2>
</div>
<Releases <Releases
v-if="release.scenes && release.scenes.length > 0" v-if="release.scenes && release.scenes.length > 0"
:releases="release.scenes" :releases="release.scenes"
@ -520,6 +536,11 @@ export default {
color: var(--shadow); color: var(--shadow);
} }
.alttitle {
color: var(--shadow);
font-size: 1rem;
}
.album-toggle { .album-toggle {
height: fit-content; height: fit-content;
display: inline-flex; display: inline-flex;

View File

@ -545,6 +545,7 @@ const releaseFragment = `
release(id: $releaseId) { release(id: $releaseId) {
id id
title title
altTitles
description description
date date
datePrecision datePrecision

View File

@ -30,6 +30,9 @@ module.exports = {
}, },
}, },
}, },
location: {
userAgent: 'contact via https://traxxx.me/',
},
analytics: { analytics: {
enabled: false, enabled: false,
address: 'http://localhost:3000/script.js', address: 'http://localhost:3000/script.js',

View File

@ -0,0 +1,11 @@
exports.up = async (knex) => {
await knex.schema.alterTable('releases', (table) => {
table.specificType('alt_titles', 'text ARRAY');
});
};
exports.down = async (knex) => {
await knex.schema.alterTable('releases', (table) => {
table.dropColumn('alt_titles');
});
};

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.0 KiB

After

Width:  |  Height:  |  Size: 4.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.0 KiB

After

Width:  |  Height:  |  Size: 4.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

After

Width:  |  Height:  |  Size: 12 KiB

View File

@ -14,10 +14,38 @@ function scrapeAll(scenes) {
const { origin, pathname, searchParams } = new URL(url); const { origin, pathname, searchParams } = new URL(url);
release.url = `${origin}${pathname}`; release.url = `${origin}${pathname}`;
release.actors = searchParams.get('models_name')?.split(','); release.shootId = pathname.match(/((LA)|(LT)|(MA)|(MD)|(MM)|(MS)|(MT)|(RR))[\w-]+/)?.[0]; // pathname sometimes contains other text, match at least two letters to prevent false positives
release.actors = searchParams.get('models_name')?.split(',').map((actor) => {
const [han, english] = actor.split('/').map((name) => name.trim());
if (/amateur/i.test(english)) {
// not a name
return null;
}
return {
name: english || han,
alias: english && han,
};
}).filter(Boolean);
}
const rawTitle = query.content('.video-title div')?.replace(release.shootId, '');
if (rawTitle) {
// find / closest to Han in case there are multiple, account for no / at all
const hanIndex = rawTitle.match(/\p{Script_Extensions=Han}/u)?.index;
const splitIndex = rawTitle.slice(0, hanIndex).lastIndexOf('/') || hanIndex;
if (hanIndex && splitIndex > -1) {
release.title = rawTitle.slice(0, splitIndex).trim();
release.altTitles = [rawTitle.slice(splitIndex + 1).trim()];
} else {
release.title = rawTitle;
}
} }
release.title = query.content('.video-title div');
release.duration = query.duration('.timestamp'); release.duration = query.duration('.timestamp');
const poster = query.img('img', { attribute: 'data-src' }); const poster = query.img('img', { attribute: 'data-src' });
@ -31,8 +59,6 @@ function scrapeAll(scenes) {
release.teaser = query.video(null, { attribute: 'data-video-src' }); release.teaser = query.video(null, { attribute: 'data-video-src' });
console.log(release);
return release; return release;
}); });
} }
@ -49,17 +75,16 @@ function scrapeProfile({ query }) {
} }
profile.description = query.content('h2') || null; profile.description = query.content('h2') || null;
profile.height = query.number('//span[text()="Measurements"]/following-sibling::span', { match: /(\d+) cm/, matchIndex: 1 }); profile.height = query.number('//span[text()="Height"]/following-sibling::span', { match: /(\d+) cm/, matchIndex: 1 });
profile.weight = query.number('//span[text()="Weight"]/following-sibling::span', { match: /(\d+) kg/, matchIndex: 1 }); profile.weight = query.number('//span[text()="Weight"]/following-sibling::span', { match: /(\d+) kg/, matchIndex: 1 });
profile.measurements = query.number('//span[text()="Birth Place"]/following-sibling::span', { match: /(\d+) cm/, matchIndex: 1 }); // can't find a single profile with this information available, but add for good measure
profile.measurements = query.content('//span[text()="Measurements"]/following-sibling::span');
profile.birthPlace = query.number('//span[text()="Birth Place"]/following-sibling::span'); profile.birthPlace = query.number('//span[text()="Birth Place"]/following-sibling::span');
profile.banner = query.img('div[class*="banner"] > img'); profile.banner = query.img('div[class*="banner"] > img');
profile.photos = query.imgs('#MusModelSwiper img'); profile.photos = query.imgs('#MusModelSwiper img');
console.log(profile);
return profile; return profile;
} }

View File

@ -82,6 +82,7 @@ const scrapers = {
americanpornstar, americanpornstar,
amateureuro: porndoe, amateureuro: porndoe,
archangel, archangel,
asiam: modelmedia,
assylum, assylum,
aziani, aziani,
badoink, badoink,
@ -115,6 +116,7 @@ const scrapers = {
interracialpass: hush, interracialpass: hush,
inthecrack, inthecrack,
jayrock, jayrock,
jerkaoke: modelmedia,
jesseloadsmonsterfacials, jesseloadsmonsterfacials,
julesjordan, julesjordan,
karups, karups,
@ -178,6 +180,7 @@ const scrapers = {
analviolation: fullpornnetwork, analviolation: fullpornnetwork,
anilos: nubiles, anilos: nubiles,
archangel, archangel,
asiam: modelmedia,
aziani, aziani,
babes: mindgeek, babes: mindgeek,
babevr: badoink, babevr: badoink,
@ -234,6 +237,7 @@ const scrapers = {
interracialpovs: hush, interracialpovs: hush,
inthecrack, inthecrack,
jamesdeen: fullpornnetwork, jamesdeen: fullpornnetwork,
jerkaoke: modelmedia,
julesjordan, julesjordan,
karups, karups,
kellymadison, kellymadison,

View File

@ -32,6 +32,7 @@ async function curateReleaseEntry(release, batchId, existingRelease, type = 'sce
const curatedRelease = { const curatedRelease = {
title: decode(release.title), title: decode(release.title),
alt_titles: release.altTitles?.map((title) => decode(title)),
entry_id: release.entryId || null, entry_id: release.entryId || null,
entity_id: release.entity.id, entity_id: release.entity.id,
studio_id: release.studio?.id || null, studio_id: release.studio?.id || null,
@ -46,6 +47,8 @@ async function curateReleaseEntry(release, batchId, existingRelease, type = 'sce
updated_batch_id: batchId, updated_batch_id: batchId,
}; };
console.log(curatedRelease);
if (release.id) { if (release.id) {
// release is updated // release is updated
curatedRelease.id = release.id; curatedRelease.id = release.id;

View File

@ -1,5 +1,7 @@
'use strict'; 'use strict';
const config = require('config');
const knex = require('../knex'); const knex = require('../knex');
const logger = require('../logger')(__filename); const logger = require('../logger')(__filename);
const http = require('./http'); const http = require('./http');
@ -27,7 +29,7 @@ async function resolvePlace(query) {
// https://operations.osmfoundation.org/policies/nominatim/ // https://operations.osmfoundation.org/policies/nominatim/
const res = await http.get(`https://nominatim.openstreetmap.org/search/${encodeURI(query)}?format=json&accept-language=en&addressdetails=1`, { const res = await http.get(`https://nominatim.openstreetmap.org/search/${encodeURI(query)}?format=json&accept-language=en&addressdetails=1`, {
headers: { headers: {
'User-Agent': 'contact at moonloop.adult@protonmail.com', 'User-Agent': config.location.userAgent,
}, },
interval: 1000, interval: 1000,
concurrency: 1, concurrency: 1,