forked from DebaucheryLibrarian/traxxx
Removed site and tag matching from the Dogfart scraper. Duplicate tag associations are now filtered out before the insert attempt.
parent 0b84c977da
commit daee426ba6
@@ -134,7 +134,7 @@
   <ul class="tags nolist">
     <li
       v-for="tag in release.tags"
-      :key="`tag-${tag.id}`"
+      :key="`tag-${tag.slug}`"
       class="tag"
     >
       <a
@@ -111,7 +111,7 @@
   >
     <li
       v-for="tag in release.tags"
-      :key="`tag-${tag.id}`"
+      :key="`tag-${tag.slug}`"
       class="tag"
     >
       <router-link
@@ -6,9 +6,6 @@ const bhttp = require('bhttp');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');
 
-const knex = require('../knex');
-const { matchTags } = require('../tags');
-
 async function getPhoto(url) {
   const res = await bhttp.get(url);
   const html = res.body.toString();
@@ -92,7 +89,7 @@ async function scrapeScene(html, url, site) {
     .replace('...read more', '')
     .trim();
 
-  const siteSlug = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
+  const channel = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
   const date = new Date(document.querySelector('meta[itemprop="uploadDate"]').content);
   const duration = moment
     .duration(`00:${document
@@ -110,17 +107,7 @@ async function scrapeScene(html, url, site) {
   const photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, site, url);
 
   const stars = Math.floor(Number(document.querySelector('span[itemprop="average"]').textContent) / 2);
-  const rawTags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);
-
-  const [channelSite, tags] = await Promise.all([
-    site.isFallback
-      ? knex('sites')
-        .where({ slug: siteSlug })
-        .orWhere({ url: `https://${siteSlug}.com` })
-        .first()
-      : site,
-    matchTags(rawTags),
-  ]);
+  const tags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);
 
   return {
     url: `${origin}${pathname}`,
@@ -138,7 +125,8 @@ async function scrapeScene(html, url, site) {
     rating: {
       stars,
     },
-    site: channelSite || site,
+    site,
+    channel,
   };
 }
 
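Before this change, scrapeScene looked up the matching site record itself whenever the site argument was only a network-level fallback, and matched tags against the database in the same Promise.all. The scraper now returns the raw tag names plus a channel slug and leaves both lookups to the shared release pipeline. If a consumer still needs the site lookup, it can be reproduced along the lines of the removed query; the helper below is hypothetical and only mirrors that query, it is not code introduced by this commit.

// Hypothetical helper mirroring the lookup removed from the scraper: resolve the
// channel slug to a site record when the scraper only received a network fallback.
const knex = require('../knex');

async function resolveChannelSite(channel, site) {
  if (!site.isFallback) {
    // the caller already passed a concrete site; keep it
    return site;
  }

  return knex('sites')
    .where({ slug: channel })
    .orWhere({ url: `https://${channel}.com` })
    .first();
}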
@@ -120,17 +120,14 @@ async function fetchSitesFromArgv() {
 async function fetchSitesFromConfig() {
   const included = destructConfigNetworks(config.include);
 
-  const networks = await knex('networks').select('id').whereIn('slug', included.networks || []);
-  const networkIds = networks.map(network => network.id);
-
   const rawSites = await knex('sites')
     .select(
       'sites.*',
       'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
     )
+    .leftJoin('networks', 'sites.network_id', 'networks.id')
     .whereIn('sites.slug', included.sites || [])
-    .orWhereIn('network_id', networkIds)
-    .leftJoin('networks', 'sites.network_id', 'networks.id');
+    .orWhereIn('networks.slug', included.networks || []);
 
   const curatedSites = await curateSites(rawSites, true);
   logger.info(`Found ${curatedSites.length} sites in database`);
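With the networks table joined before the filters run, the included network slugs can be matched directly in the same query, which is what makes the separate networks / networkIds lookup above redundant. Below is a condensed sketch of the resulting query; the default included value is a hypothetical example of what destructConfigNetworks might produce, not taken from this commit.

// Illustrative only: a hypothetical `included` object. The single joined query picks
// up a site either because its own slug is listed or because its network's slug is.
async function fetchIncludedSites(knex, included = { sites: ['examplesite'], networks: ['examplenetwork'] }) {
  return knex('sites')
    .select('sites.*', 'networks.slug as network_slug')
    .leftJoin('networks', 'sites.network_id', 'networks.id')
    .whereIn('sites.slug', included.sites || [])
    .orWhereIn('networks.slug', included.networks || []);
}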
@@ -42,7 +42,6 @@ async function matchTags(rawTags) {
   const tagEntries = await knex('tags')
     .pluck('aliases.id')
     .whereIn('tags.name', tags)
-    .orWhereIn('tags.slug', tags)
     .leftJoin('tags as aliases', function join() {
       this
         .on('tags.alias_for', 'aliases.id')
@@ -66,7 +65,7 @@ async function associateTags(release, releaseId) {
     ? await matchTags(release.tags) // scraper returned raw tags
     : rawReleaseTags; // tags already matched by (outdated) scraper
 
-  const tags = releaseTags.concat(siteTags);
+  const tags = Array.from(new Set(releaseTags.concat(siteTags)));
 
   if (tags.length === 0) {
     logger.info(`No tags available for (${release.site.name}, ${releaseId}) "${release.title}"`);
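A release's matched tags can overlap with the site's default tags, so concatenating the two lists can yield the same tag ID more than once, and each duplicate would previously reach the insert attempt. Passing the concatenation through a Set drops the repeats. A minimal standalone illustration with made-up IDs:

// Deduplicating numeric tag IDs via a Set, as in the change above; the IDs are invented.
const releaseTags = [12, 45, 78];
const siteTags = [45, 90];

const tags = Array.from(new Set(releaseTags.concat(siteTags)));

console.log(tags); // [ 12, 45, 78, 90 ]

Note that this only collapses duplicates of primitive values such as IDs; if the lists held tag objects, identical tags would not be merged by a Set.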