tab-yoinker/01-scraper-urls/url-scraper.js
2021-09-23 23:15:03 -05:00

256 lines
11 KiB
JavaScript

const fs = require('fs/promises');
const path = require('path');
const jsdom = require('jsdom');
const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
const ConcurrentQueue = require('./concurrent-queue.js');
// Error type for scraper failures; links the underlying error via `cause`.
class ScrapeError extends Error {
    /**
     * @param {string} message - human-readable description of the failure
     * @param {{cause?: Error}} [options] - standard Error options; `cause` preserves the original error
     */
    constructor(message, options) {
        // Pass message/options explicitly instead of `super(...arguments)`:
        // the old fileName/lineNumber parameters were non-standard
        // (Firefox-only) and were ignored by Node's Error constructor anyway.
        super(message, options);
        this.name = 'ScrapeError';
    }
}
// From Cordis util.js
/**
 * Make a string safe to use as a Windows (and cross-platform) file name.
 * - Reserved device names (CON, PRN, AUX, NUL, COM1-9, LPT1-9) get '_' appended.
 * - Reserved characters <>:"/\|?* are stripped.
 * - Characters outside char codes 1-254 are replaced with '_'.
 * @param {string} name - candidate file name (no directory part)
 * @returns {string} sanitized file name
 */
function sanitizeFileName(name) {
    // Illegal File Names (Windows)
    const RESERVED_NAMES = [
        'CON', 'PRN', 'AUX', 'NUL',
        'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
        'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9',
    ];
    // Fix (resolves old TODO): Windows treats device names case-insensitively,
    // so compare the uppercased name ('con' is just as illegal as 'CON').
    if (RESERVED_NAMES.includes(name.toUpperCase())) {
        name += '_';
    }
    // Reserved Characters
    name = name.replace(/[<>:\"\/\\|?*]/g, '');
    // Allowed Characters: keep char codes 1..254, replace everything else with '_'
    return name.split('').map(c => c.charCodeAt(0) < 255 && c.charCodeAt(0) > 0 ? c : '_').join('');
}
/**
 * Estimate the milliseconds remaining for a task by linearly extrapolating
 * the elapsed time against the fraction already finished.
 * @param {number} startTime - epoch ms when the task began
 * @param {number} ratioComplete - fraction of work complete, in (0, 1]
 * @returns {number} estimated ms until completion
 */
function estMSRemaining(startTime, ratioComplete) {
    const elapsedMS = Date.now() - startTime;
    const projectedPerUnit = elapsedMS / ratioComplete;
    return projectedPerUnit * (1 - ratioComplete);
}
/**
 * Format a millisecond duration as a human-readable string, using the
 * largest fitting unit (ms, s, mins, hours) with two decimal places.
 * @param {number} msRelative - duration in milliseconds
 * @returns {string} formatted duration, e.g. "1.50s" or "2.00 hours"
 */
function formatRelative(msRelative) {
    const SECOND = 1000;
    const MINUTE = 60 * SECOND;
    const HOUR = 60 * MINUTE;
    if (msRelative < SECOND) {
        return `${(msRelative).toFixed(2)}ms`;
    }
    if (msRelative < MINUTE) {
        return `${(msRelative / SECOND).toFixed(2)}s`;
    }
    if (msRelative < HOUR) {
        return `${(msRelative / MINUTE).toFixed(2)} mins`;
    }
    return `${(msRelative / HOUR).toFixed(2)} hours`;
}
/**
 * Promise-based delay.
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>} resolves after roughly `ms` milliseconds
 */
async function sleep(ms) {
    // Adapt the setTimeout callback API to a Promise so callers can `await`.
    return new Promise((resolve) => setTimeout(resolve, ms));
}
/**
 * Sleep for a random 500-1000ms, so request pacing looks less robotic.
 */
async function fuzzyDelay() {
    const jitterMS = 500 * Math.random();
    await sleep(500 + jitterMS);
}
/**
 * Serialize a value to JSON and write it to disk.
 * @param {string} filename - destination path
 * @param {*} dataJSON - any JSON-serializable value
 */
async function saveJsonData(filename, dataJSON) {
    const serialized = JSON.stringify(dataJSON);
    await fs.writeFile(filename, serialized);
}
// Note: This is the key scraper function. It scrapes the .js-store's data
/**
 * Fetch `url`, parse its HTML, and return the JSON object that
 * ultimate-guitar.com embeds in the `data-content` attribute of the
 * page's `.js-store` element.
 * @param {string} url - page URL to scrape
 * @returns {Promise<object>} the parsed data-content object
 * @throws {ScrapeError} for every failure mode (fetch, decode, parse,
 *   missing element/attribute, malformed JSON), with `cause` set where
 *   an underlying error exists.
 */
async function scrapeUGDataContent(url) {
    let page = null;
    try {
        page = await fetch(url);
    } catch (e) {
        throw new ScrapeError('Unable to fetch url', { cause: e });
    }
    let text = null;
    try {
        text = await page.text();
    } catch (e) {
        throw new ScrapeError('Unable to decode page', { cause: e });
    }
    let dom = null;
    try {
        dom = new jsdom.JSDOM(text);
    } catch (e) {
        throw new ScrapeError('Unable to parse document', { cause: e });
    }
    if (!dom.window || !dom.window.document) {
        throw new ScrapeError('Unable to parse document');
    }
    const document = dom.window.document;
    const jsStore = document.querySelector('.js-store');
    if (jsStore == null) {
        throw new ScrapeError('Unable to find .js-store element for ' + url);
    }
    const contentJSON = jsStore.getAttribute('data-content');
    if (contentJSON == null) {
        throw new ScrapeError('Unable to find data-content attribute on .js-store');
    }
    // Fix: JSON.parse previously threw a bare SyntaxError on malformed
    // data-content; wrap it so ALL failure modes surface as ScrapeError.
    try {
        return JSON.parse(contentJSON);
    } catch (e) {
        throw new ScrapeError('Unable to parse data-content JSON', { cause: e });
    }
}
/**
 * Extract a tab's metadata and body text from a scraped data-content object.
 * @param {object} ugDataContent - parsed `.js-store` data-content
 * @returns {{meta: object, text: string}} tab metadata and tab body text
 * @throws {ScrapeError} when any expected property is missing
 */
function parseTab(ugDataContent) {
    // Fix: use `== null` instead of `=== null`. A missing property is
    // `undefined`, which the old strict-null checks let through, producing a
    // bare TypeError on the next property access instead of a ScrapeError.
    const store = ugDataContent.store;
    if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page;
    if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data;
    if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const meta = data.tab;
    if (meta == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab');
    const tview = data.tab_view;
    if (tview == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view');
    const wktab = tview.wiki_tab;
    if (wktab == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab');
    const text = wktab.content;
    if (text == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content');
    return { meta, text };
}
/**
 * Extract one page of the alphabetical band listing from a scraped
 * data-content object.
 * @param {object} ugDataContent - parsed `.js-store` data-content
 * @returns {{alpha: string, artists: Array, pagenum: number, pagecnt: number}}
 * @throws {ScrapeError} when any expected property is missing
 */
function parseBandsPage(ugDataContent) {
    // Fix: use `== null` instead of `=== null` so that missing (undefined)
    // properties raise ScrapeError rather than a TypeError downstream.
    const store = ugDataContent.store;
    if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page;
    if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data;
    if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const alpha = data.alpha;
    if (alpha == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.alpha');
    const artists = data.artists;
    if (artists == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.artists');
    const pagenum = data.current_page;
    if (pagenum == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.current_page');
    const pagecnt = data.page_count;
    if (pagecnt == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.page_count');
    return { alpha, artists, pagenum, pagecnt };
}
/**
 * Extract one page of an artist's tab listing from a scraped data-content
 * object. The tab lists come in four categories which callers concatenate.
 * @param {object} ugDataContent - parsed `.js-store` data-content
 * @returns {{albumTabs: Array, chordProTabs: Array, featTabs: Array, otherTabs: Array, pagenum: number, pages: Array}}
 * @throws {ScrapeError} when any expected property is missing
 */
function parseArtistPage(ugDataContent) {
    // Fix 1: use `== null` instead of `=== null` so that missing (undefined)
    // properties raise ScrapeError rather than a TypeError downstream.
    // Fix 2: the feat_tabs/other_tabs guards previously re-checked
    // `chordProTabs` (copy-paste bug), so those two could silently be missing.
    const store = ugDataContent.store;
    if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page;
    if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data;
    if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const pagination = data.pagination;
    if (pagination == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination');
    const pagenum = pagination.current;
    if (pagenum == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.current');
    const pages = pagination.pages;
    if (pages == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.pages');
    const albumTabs = data.album_tabs;
    if (albumTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.album_tabs');
    const chordProTabs = data.chord_pro_tabs;
    if (chordProTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.chord_pro_tabs');
    const featTabs = data.feat_tabs;
    if (featTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.feat_tabs');
    const otherTabs = data.other_tabs;
    if (otherTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.other_tabs');
    return { albumTabs, chordProTabs, featTabs, otherTabs, pagenum, pages };
}
// Returns a list of tab metadata (including tab URL)
// Walks every page of an artist's tab listing, following the pagination data
// embedded in each page, and returns the de-duplicated concatenation of all
// four tab categories (album, chord-pro, featured, other).
async function scrapeAllArtistTabListPages(startURL) {
let tabs = [];
let url = new URL(startURL); // Note: not considering the <base> tag, would have to change the implementation if this gets used somewhere.
while (true) {
//console.log('scraping artist page: ' + url.toString());
const ugDataContent = await scrapeUGDataContent(url.toString());
const page = parseArtistPage(ugDataContent);
tabs = tabs.concat(page.albumTabs, page.chordProTabs, page.featTabs, page.otherTabs);
// Find the pagination entry for the next page; loose `==` is used here —
// presumably because the page number may arrive as a string. TODO confirm.
const nextPageData = page.pages.find(pageData => pageData.page == page.pagenum + 1);
if (nextPageData == null) break; // no next page entry -> last page reached
url = new URL(nextPageData.url, url); // next-page URL may be relative; resolve against the current URL
await fuzzyDelay(); // pacing delay between page fetches
}
// The site can return the same tab (same id) on multiple pages; keep only
// the first occurrence of each id.
const uniqueTabIds = new Set();
const uniqueTabs = [];
for (let tab of tabs) {
if (uniqueTabIds.has(tab.id)) continue;
uniqueTabIds.add(tab.id);
uniqueTabs.push(tab);
}
return uniqueTabs;
}
// Returns a list of artist metadata (including artist tab list URL)
// Pages through an alphabetical band listing (d.htm, d2.htm, d3.htm, ...)
// and concatenates every page's artist entries, logging progress with an ETA.
async function scrapeAllBandListPages(startURL) {
let artists = [];
// https://www.ultimate-guitar.com/bands/d.htm
let url = new URL(startURL);
let startTime = Date.now(); // baseline for the ETA estimate logged below
while (true) {
const ugDataContent = await scrapeUGDataContent(url.toString());
const page = parseBandsPage(ugDataContent);
artists = artists.concat(page.artists);
let fromNow = formatRelative(estMSRemaining(startTime, page.pagenum / page.pagecnt));
console.log(`Band List Status: ${page.pagenum} / ${page.pagecnt} pages complete (${(page.pagenum / page.pagecnt * 100).toFixed(2)}%, ${fromNow} remaining)`);
if (page.pagenum + 1 > page.pagecnt) break; // last page processed
// Build the next page's URL by rewriting the file name:
// d.htm (start) -> d2.htm -> d3.htm -> ...
// NOTE(review): assumes startURL always ends in ".htm" (hence slice(0, -4)) — confirm for all callers.
url = new URL(startURL.slice(0, -4) + (page.pagenum + 1) + '.htm');
await fuzzyDelay(); // pacing delay between page fetches
}
return artists;
}
/**
 * Scrape an entire band listing and persist the resulting artist array
 * to `filename` as JSON.
 * @param {string} filename - output JSON path
 * @param {string} url - first page of the band listing
 */
async function saveBandList(filename, url) {
    const artists = await scrapeAllBandListPages(url);
    await saveJsonData(filename, artists);
}
// Scrapes each band-list URL in turn and saves its artists under output/artists/.
/**
 * Scrape and save every band listing in `urls` sequentially, logging
 * progress with an ETA after each one. Output files go to output/artists/,
 * named after the sanitized source URL.
 * @param {string[]} urls - band-list start URLs
 */
async function saveBandLists(urls) {
    const startTime = Date.now();
    let completed = 0;
    for (const url of urls) {
        console.log('doing band list: ' + url);
        const outputPath = path.join('output', 'artists', sanitizeFileName(url + '.json'));
        await saveBandList(outputPath, url);
        completed += 1;
        const ratioDone = completed / urls.length;
        const fromNow = formatRelative(estMSRemaining(startTime, ratioDone));
        console.log(`Save All Band List Status: ${completed} / ${urls.length} band lists complete (${(ratioDone * 100).toFixed(2)}%, ${fromNow} remaining)`);
        await fuzzyDelay(); // pacing delay between band lists
    }
}
// Note: modifies artists to add a 'tabs' property to each artist. This property contains a list
// of the artist's tab metadatas (tab text is done in a different step)
// Scrapes every artist's full tab list concurrently, then saves the augmented
// artists array to `filename` as JSON.
async function saveArtistsWithTabMetadata(filename, artists) {
const baseURL = 'https://www.ultimate-guitar.com/';
let startTime = Date.now(); // baseline for ETA / throughput logging below
let completed = 0;
let taskQueue = new ConcurrentQueue(8); // Run a maximum of 8 artist tab list scrapers at a time (comment previously said 4, out of sync with the code)
// Note: the concurrent queue will (almost certainly) cause the artists to be somewhat to completely out of order in the output
for (let artist of artists) {
taskQueue.push(async () => {
let artistStartURL = new URL(artist.artist_url, baseURL); // artist_url may be relative; resolve against the site root
let artistTabs = await scrapeAllArtistTabListPages(artistStartURL);
artist.tabs = artistTabs; // mutates the caller's array elements (documented above)
completed += 1;
// Progress logging: percentage complete, ETA, and throughput (%/min, artists/min).
let fromNow = formatRelative(estMSRemaining(startTime, completed / artists.length));
let pctPerMin = ((100 * completed / artists.length) / ((Date.now() - startTime) / (60 * 1000))).toFixed(2);
let artistsPerMin = (completed / ((Date.now() - startTime) / (60 * 1000))).toFixed(2);
console.log(`Save Artists with Tab Metadata Status: ${completed} / ${artists.length} artists complete (${(completed / artists.length * 100).toFixed(2)}%, ${fromNow} remaining, ${pctPerMin} %/min, ${artistsPerMin} artists/min)`);
});
}
await taskQueue.waitForDrain(); // wait for every queued scrape task to finish
await saveJsonData(filename, artists);
}
// Public API: internal save* functions are exported under scraper-oriented names.
module.exports = {
scrapeBands: saveBandLists,
scrapeArtistTabUrls: saveArtistsWithTabMetadata
};