Experimentally scraping data from HardX.

This commit is contained in:
ThePendulum 2019-03-03 04:18:33 +01:00
parent cf8f299061
commit 71aa31dda6
3 changed files with 1771 additions and 67 deletions

1726
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,30 +1,41 @@
{
"name": "traxxx",
"version": "1.0.0",
"description": "All the latest porn releases in one place",
"main": "src/app.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"repository": {
"type": "git",
"url": "https://gitea.unknown.name/niels/traxxx.git"
},
"keywords": [
"porn",
"releases",
"updates",
"nsfw"
],
"author": "Niels Simenon",
"license": "ISC",
"devDependencies": {
"@babel/cli": "^7.2.3",
"@babel/core": "^7.3.4",
"@babel/preset-env": "^7.3.4",
"babel-preset-airbnb": "^3.2.0"
},
"dependencies": {
"config": "^3.0.1"
}
"name": "traxxx",
"version": "1.0.0",
"description": "All the latest porn releases in one place",
"main": "src/app.js",
"scripts": {
"start": "node src/app.js",
"eslint": "eslint src/",
"eslint-watch": "esw --watch src/"
},
"repository": {
"type": "git",
"url": "https://gitea.unknown.name/niels/traxxx.git"
},
"keywords": [
"porn",
"releases",
"updates",
"nsfw"
],
"author": "Niels Simenon",
"license": "ISC",
"devDependencies": {
"@babel/cli": "^7.2.3",
"@babel/core": "^7.3.4",
"@babel/preset-env": "^7.3.4",
"babel-eslint": "^10.0.1",
"babel-preset-airbnb": "^3.2.0",
"eslint": "^5.15.0",
"eslint-config-airbnb-base": "^13.1.0",
"eslint-plugin-import": "^2.16.0",
"eslint-watch": "^4.0.2"
},
"dependencies": {
"bhttp": "^1.2.4",
"cheerio": "^1.0.0-rc.2",
"config": "^3.0.1",
"date-fns": "^1.30.1",
"terminal-kit": "^1.27.0"
}
}

45
src/app.js Normal file
View File

@@ -0,0 +1,45 @@
'use strict';
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const { parse, format } = require('date-fns');
/**
 * Converts like/dislike counts into a star score between 1 and 5.
 *
 * Every like counts as five stars and every dislike as one star; the mean is
 * truncated (not rounded) to two decimals.
 *
 * @param {number} likes - Number of likes.
 * @param {number} dislikes - Number of dislikes.
 * @returns {?number} The score, or null when there are no votes or a count is not numeric.
 */
function calculateStars(likes, dislikes) {
  const votes = likes + dislikes;

  // Zero votes would divide by zero, and a missing or non-numeric count would propagate NaN.
  if (!Number.isFinite(votes) || votes <= 0) {
    return null;
  }

  return Math.floor(((likes * 5 + dislikes) / votes) * 100) / 100;
}

/**
 * Parses a scene date written as MM-DD-YYYY into a Date at local midnight.
 *
 * date-fns 1.x `parse` takes an options object as its second argument, not a
 * format string, so non-ISO text handed to it ends up in the engine's
 * implementation-dependent Date string parser. The parts are read explicitly
 * here instead.
 *
 * @param {string} text - Raw date text taken from the page.
 * @returns {?Date} The parsed date, or null when the text is not a valid MM-DD-YYYY date.
 */
function parseSceneDate(text) {
  const match = /^(\d{1,2})-(\d{1,2})-(\d{4})$/.exec(String(text).trim());

  if (!match) {
    return null;
  }

  const [month, day, year] = match.slice(1).map(Number);
  const date = new Date(year, month - 1, day);

  // Date silently rolls over out-of-range parts (e.g. month 13); reject those instead.
  if (date.getMonth() !== month - 1 || date.getDate() !== day) {
    return null;
  }

  return date;
}

/**
 * Fetches the HardX video listing, extracts one record per `.sceneInfo`
 * element and prints the resulting array with `console.log`.
 *
 * Each record has the shape
 * `{ url, title, actors, date, rating: { likes, dislikes, stars } }`.
 * `url` is null when the title link has no `href`; `date` and `stars` are
 * null when the page does not provide usable values for them.
 *
 * @returns {Promise<void>} Resolves once the scenes have been printed; rejects
 *   with whatever the HTTP request or the HTML parsing throws.
 */
async function init() {
  const baseUrl = 'https://www.hardx.com';
  const res = await bhttp.get(`${baseUrl}/en/videos`, {});
  const $ = cheerio.load(res.body.toString(), { normalizeWhitespace: true });

  const scenes = $('.sceneInfo').toArray().map((element) => {
    const sceneElement = $(element);
    const sceneLinkElement = sceneElement.find('.sceneTitle a');

    // Resolve against the site root so relative and absolute hrefs both work,
    // and a missing href does not become the literal text "undefined".
    const href = sceneLinkElement.attr('href');
    const url = href ? new URL(href, baseUrl).href : null;
    const title = sceneLinkElement.attr('title');

    // The first two `.value` elements are read as likes and dislikes, in that order.
    const [likes, dislikes] = sceneElement.find('.value')
      .toArray()
      .map(value => Number($(value).text()));

    const actors = sceneElement.find('.sceneActors a')
      .map((actorIndex, actorElement) => $(actorElement).attr('title'))
      .toArray();

    const date = parseSceneDate(sceneElement.find('.sceneDate').text());

    return {
      url,
      title,
      actors,
      date,
      rating: {
        likes,
        dislikes,
        stars: calculateStars(likes, dislikes),
      },
    };
  });

  console.log(scenes);
}
// Entry point: run the scraper. The promise must not be left floating, so a
// failed request or parse is reported and reflected in the process exit code.
init().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});