Removed site and tag matching from the Dogfart scraper. Filtering out duplicate tag associations before attempting the insert.

ThePendulum 2020-01-17 01:55:54 +01:00
parent 0b84c977da
commit daee426ba6
5 changed files with 9 additions and 25 deletions

View File

@@ -134,7 +134,7 @@
 <ul class="tags nolist">
   <li
     v-for="tag in release.tags"
-    :key="`tag-${tag.id}`"
+    :key="`tag-${tag.slug}`"
     class="tag"
   >
     <a

View File

@@ -111,7 +111,7 @@
 >
   <li
     v-for="tag in release.tags"
-    :key="`tag-${tag.id}`"
+    :key="`tag-${tag.slug}`"
     class="tag"
   >
     <router-link
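Both components now key the tag list by tag.slug instead of tag.id. A minimal sketch of the assumption this leans on (illustrative data, not from the source): after alias matching on the backend, two raw tags can resolve to the same canonical tag, so keys derived from id could collide unless the list is deduplicated first.

// Illustrative only: duplicate canonical tags would produce duplicate
// v-for keys; deduplicating by slug keeps each key unique.
const tags = [
  { id: 12, slug: 'interracial' },
  { id: 12, slug: 'interracial' }, // duplicate after alias resolution
];

const uniqueTags = Array.from(
  new Map(tags.map(tag => [tag.slug, tag])).values(),
);

console.log(uniqueTags.length); // 1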

View File

@@ -6,9 +6,6 @@ const bhttp = require('bhttp');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');
 
-const knex = require('../knex');
-const { matchTags } = require('../tags');
-
 async function getPhoto(url) {
   const res = await bhttp.get(url);
   const html = res.body.toString();
@@ -92,7 +89,7 @@ async function scrapeScene(html, url, site) {
     .replace('...read more', '')
     .trim();
 
-  const siteSlug = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
+  const channel = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
   const date = new Date(document.querySelector('meta[itemprop="uploadDate"]').content);
   const duration = moment
     .duration(`00:${document
@@ -110,17 +107,7 @@ async function scrapeScene(html, url, site) {
   const photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, site, url);
 
   const stars = Math.floor(Number(document.querySelector('span[itemprop="average"]').textContent) / 2);
-  const rawTags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);
-
-  const [channelSite, tags] = await Promise.all([
-    site.isFallback
-      ? knex('sites')
-        .where({ slug: siteSlug })
-        .orWhere({ url: `https://${siteSlug}.com` })
-        .first()
-      : site,
-    matchTags(rawTags),
-  ]);
+  const tags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);
 
   return {
     url: `${origin}${pathname}`,
@@ -138,7 +125,8 @@ async function scrapeScene(html, url, site) {
     rating: {
       stars,
     },
 
-    site: channelSite || site,
+    site,
+    channel,
   };
 }
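After this change the scraper returns raw data only: the channel slug parsed from '.site-name' and the raw category strings, with site resolution and tag matching deferred to the shared release logic. A minimal sketch of the resulting shape, with made-up values (URL, title, and channel slug are hypothetical):

// Sketch of what scrapeScene now resolves to; values are illustrative.
function exampleScene(site) {
  return {
    url: 'https://dogfartnetwork.com/tour/scenes/example',
    title: 'Example Scene',
    tags: ['Interracial', 'Blowjob'], // raw strings, matched later by associateTags()
    rating: { stars: 4 },
    site, // passed through unchanged, even when it is a network-level fallback
    channel: 'examplechannel', // hypothetical slug parsed from '.site-name'
  };
}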

View File

@@ -120,17 +120,14 @@ async function fetchSitesFromArgv() {
 async function fetchSitesFromConfig() {
   const included = destructConfigNetworks(config.include);
 
-  const networks = await knex('networks').select('id').whereIn('slug', included.networks || []);
-  const networkIds = networks.map(network => network.id);
-
   const rawSites = await knex('sites')
     .select(
       'sites.*',
       'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
     )
+    .leftJoin('networks', 'sites.network_id', 'networks.id')
     .whereIn('sites.slug', included.sites || [])
-    .orWhereIn('network_id', networkIds)
-    .leftJoin('networks', 'sites.network_id', 'networks.id');
+    .orWhereIn('networks.slug', included.networks || []);
 
   const curatedSites = await curateSites(rawSites, true);
   logger.info(`Found ${curatedSites.length} sites in database`);
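The rework drops the separate knex('networks') round-trip: the WHERE clause now filters on networks.slug directly, which the left join already exposes. (Knex assembles join and where clauses by type, so moving the leftJoin call above whereIn is a readability choice rather than a requirement.) A minimal sketch, assuming a configured knex instance and hypothetical slugs:

// One round-trip instead of two; .toString() shows the compiled SQL.
const query = knex('sites')
  .select('sites.*', 'networks.slug as network_slug')
  .leftJoin('networks', 'sites.network_id', 'networks.id')
  .whereIn('sites.slug', ['somesite'])
  .orWhereIn('networks.slug', ['somenetwork']);

console.log(query.toString());
// roughly: select sites.*, networks.slug as network_slug from sites
//   left join networks on sites.network_id = networks.id
//   where sites.slug in ('somesite') or networks.slug in ('somenetwork')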

View File

@@ -42,7 +42,6 @@ async function matchTags(rawTags) {
   const tagEntries = await knex('tags')
     .pluck('aliases.id')
     .whereIn('tags.name', tags)
-    .orWhereIn('tags.slug', tags)
     .leftJoin('tags as aliases', function join() {
       this
         .on('tags.alias_for', 'aliases.id')
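With .orWhereIn('tags.slug', tags) gone, raw tags are matched against canonical tag names only, and the self-join on alias_for maps alias rows to their canonical IDs. An in-memory illustration of that intent (hypothetical data and helper name; the real query is the knex call above):

const tagRows = [
  { id: 1, name: 'anal', alias_for: null },
  { id: 2, name: 'ass fucking', alias_for: 1 }, // alias of tag 1
];

function matchTagIdsSketch(rawTags) {
  const names = rawTags.map(name => name.toLowerCase());
  return tagRows
    .filter(row => names.includes(row.name))
    .map(row => (row.alias_for === null ? row.id : row.alias_for));
}

console.log(matchTagIdsSketch(['Anal', 'Ass Fucking'])); // [1, 1]

Note the duplicate in the output: alias resolution is one plausible way the same tag ID can appear twice, which the next hunk guards against.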
@@ -66,7 +65,7 @@ async function associateTags(release, releaseId) {
     ? await matchTags(release.tags) // scraper returned raw tags
     : rawReleaseTags; // tags already matched by (outdated) scraper
 
-  const tags = releaseTags.concat(siteTags);
+  const tags = Array.from(new Set(releaseTags.concat(siteTags)));
 
   if (tags.length === 0) {
     logger.info(`No tags available for (${release.site.name}, ${releaseId}) "${release.title}"`);
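Wrapping the concatenation in Array.from(new Set(...)) is the duplicate filtering the commit message refers to: a tag ID present in both the release's matched tags and the site's default tags (or appearing twice after alias resolution) is associated once. A standalone illustration with made-up IDs:

const releaseTags = [3, 7, 12];
const siteTags = [7, 19]; // hypothetical site-wide default tags

const tags = Array.from(new Set(releaseTags.concat(siteTags)));
console.log(tags); // [3, 7, 12, 19] -- 7 appears once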