tab-yoinker/01-scraper-urls/url-scraper.js
2021-09-23 23:15:03 -05:00

256 lines
11 KiB
JavaScript

const fs = require('fs/promises');
const path = require('path');
const jsdom = require('jsdom');
const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
const ConcurrentQueue = require('./concurrent-queue.js');
// Error type for scraper failures; links the underlying error via `cause`.
class ScrapeError extends Error {
    /**
     * @param {string} message - human-readable description of the failure
     * @param {{cause?: Error}} [options] - standard Error options; `cause` preserves the original error
     */
    constructor(message, options) {
        // Pass message/options explicitly instead of `super(...arguments)`:
        // the old fileName/lineNumber parameters were non-standard
        // (Firefox-only) and were ignored by Node's Error constructor anyway.
        super(message, options);
        this.name = 'ScrapeError';
    }
}
// From Cordis util.js
/**
 * Make a string safe to use as a Windows (and cross-platform) file name.
 * - Reserved device names (CON, PRN, AUX, NUL, COM1-9, LPT1-9) get '_' appended.
 * - Reserved characters <>:"/\|?* are stripped.
 * - Characters outside char codes 1-254 are replaced with '_'.
 * @param {string} name - candidate file name (no directory part)
 * @returns {string} sanitized file name
 */
function sanitizeFileName(name) {
    // Illegal File Names (Windows)
    const RESERVED_NAMES = [
        'CON', 'PRN', 'AUX', 'NUL',
        'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
        'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9',
    ];
    // Fix (resolves old TODO): Windows treats device names case-insensitively,
    // so compare the uppercased name ('con' is just as illegal as 'CON').
    if (RESERVED_NAMES.includes(name.toUpperCase())) {
        name += '_';
    }
    // Reserved Characters
    name = name.replace(/[<>:\"\/\\|?*]/g, '');
    // Allowed Characters: keep char codes 1..254, replace everything else with '_'
    return name.split('').map(c => c.charCodeAt(0) < 255 && c.charCodeAt(0) > 0 ? c : '_').join('');
}
/**
 * Estimate the milliseconds remaining for a task by linearly extrapolating
 * the elapsed time against the fraction already finished.
 * @param {number} startTime - epoch ms when the task began
 * @param {number} ratioComplete - fraction of work complete, in (0, 1]
 * @returns {number} estimated ms until completion
 */
function estMSRemaining(startTime, ratioComplete) {
    const elapsedMS = Date.now() - startTime;
    const projectedPerUnit = elapsedMS / ratioComplete;
    return projectedPerUnit * (1 - ratioComplete);
}
/**
 * Format a millisecond duration as a human-readable string, using the
 * largest fitting unit (ms, s, mins, hours) with two decimal places.
 * @param {number} msRelative - duration in milliseconds
 * @returns {string} formatted duration, e.g. "1.50s" or "2.00 hours"
 */
function formatRelative(msRelative) {
    const SECOND = 1000;
    const MINUTE = 60 * SECOND;
    const HOUR = 60 * MINUTE;
    if (msRelative < SECOND) {
        return `${(msRelative).toFixed(2)}ms`;
    }
    if (msRelative < MINUTE) {
        return `${(msRelative / SECOND).toFixed(2)}s`;
    }
    if (msRelative < HOUR) {
        return `${(msRelative / MINUTE).toFixed(2)} mins`;
    }
    return `${(msRelative / HOUR).toFixed(2)} hours`;
}
/**
 * Promise-based delay.
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>} resolves after roughly `ms` milliseconds
 */
async function sleep(ms) {
    // Adapt the setTimeout callback API to a Promise so callers can `await`.
    return new Promise((resolve) => setTimeout(resolve, ms));
}
/**
 * Sleep for a random 500-1000ms, so request pacing looks less robotic.
 */
async function fuzzyDelay() {
    const jitterMS = 500 * Math.random();
    await sleep(500 + jitterMS);
}
/**
 * Serialize a value to JSON and write it to disk.
 * @param {string} filename - destination path
 * @param {*} dataJSON - any JSON-serializable value
 */
async function saveJsonData(filename, dataJSON) {
    const serialized = JSON.stringify(dataJSON);
    await fs.writeFile(filename, serialized);
}
// Note: This is the key scraper function. It scrapes the .js-store's data
/**
 * Fetch `url`, parse its HTML, and return the JSON object that
 * ultimate-guitar.com embeds in the `data-content` attribute of the
 * page's `.js-store` element.
 * @param {string} url - page URL to scrape
 * @returns {Promise<object>} the parsed data-content object
 * @throws {ScrapeError} for every failure mode (fetch, decode, parse,
 *   missing element/attribute, malformed JSON), with `cause` set where
 *   an underlying error exists.
 */
async function scrapeUGDataContent(url) {
    let page = null;
    try {
        page = await fetch(url);
    } catch (e) {
        throw new ScrapeError('Unable to fetch url', { cause: e });
    }
    let text = null;
    try {
        text = await page.text();
    } catch (e) {
        throw new ScrapeError('Unable to decode page', { cause: e });
    }
    let dom = null;
    try {
        dom = new jsdom.JSDOM(text);
    } catch (e) {
        throw new ScrapeError('Unable to parse document', { cause: e });
    }
    if (!dom.window || !dom.window.document) {
        throw new ScrapeError('Unable to parse document');
    }
    const document = dom.window.document;
    const jsStore = document.querySelector('.js-store');
    if (jsStore == null) {
        throw new ScrapeError('Unable to find .js-store element for ' + url);
    }
    const contentJSON = jsStore.getAttribute('data-content');
    if (contentJSON == null) {
        throw new ScrapeError('Unable to find data-content attribute on .js-store');
    }
    // Fix: JSON.parse previously threw a bare SyntaxError on malformed
    // data-content; wrap it so ALL failure modes surface as ScrapeError.
    try {
        return JSON.parse(contentJSON);
    } catch (e) {
        throw new ScrapeError('Unable to parse data-content JSON', { cause: e });
    }
}
/**
 * Extract a tab's metadata and body text from a scraped data-content object.
 * @param {object} ugDataContent - parsed `.js-store` data-content
 * @returns {{meta: object, text: string}} tab metadata and tab body text
 * @throws {ScrapeError} when any expected property is missing
 */
function parseTab(ugDataContent) {
    // Fix: use `== null` instead of `=== null`. A missing property is
    // `undefined`, which the old strict-null checks let through, producing a
    // bare TypeError on the next property access instead of a ScrapeError.
    const store = ugDataContent.store;
    if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page;
    if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data;
    if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const meta = data.tab;
    if (meta == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab');
    const tview = data.tab_view;
    if (tview == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view');
    const wktab = tview.wiki_tab;
    if (wktab == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab');
    const text = wktab.content;
    if (text == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content');
    return { meta, text };
}
/**
 * Extract one page of the alphabetical band listing from a scraped
 * data-content object.
 * @param {object} ugDataContent - parsed `.js-store` data-content
 * @returns {{alpha: string, artists: Array, pagenum: number, pagecnt: number}}
 * @throws {ScrapeError} when any expected property is missing
 */
function parseBandsPage(ugDataContent) {
    // Fix: use `== null` instead of `=== null` so that missing (undefined)
    // properties raise ScrapeError rather than a TypeError downstream.
    const store = ugDataContent.store;
    if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page;
    if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data;
    if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const alpha = data.alpha;
    if (alpha == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.alpha');
    const artists = data.artists;
    if (artists == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.artists');
    const pagenum = data.current_page;
    if (pagenum == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.current_page');
    const pagecnt = data.page_count;
    if (pagecnt == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.page_count');
    return { alpha, artists, pagenum, pagecnt };
}
/**
 * Extract one page of an artist's tab listing from a scraped data-content
 * object. The tab lists come in four categories which callers concatenate.
 * @param {object} ugDataContent - parsed `.js-store` data-content
 * @returns {{albumTabs: Array, chordProTabs: Array, featTabs: Array, otherTabs: Array, pagenum: number, pages: Array}}
 * @throws {ScrapeError} when any expected property is missing
 */
function parseArtistPage(ugDataContent) {
    // Fix 1: use `== null` instead of `=== null` so that missing (undefined)
    // properties raise ScrapeError rather than a TypeError downstream.
    // Fix 2: the feat_tabs/other_tabs guards previously re-checked
    // `chordProTabs` (copy-paste bug), so those two could silently be missing.
    const store = ugDataContent.store;
    if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page;
    if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data;
    if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const pagination = data.pagination;
    if (pagination == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination');
    const pagenum = pagination.current;
    if (pagenum == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.current');
    const pages = pagination.pages;
    if (pages == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.pages');
    const albumTabs = data.album_tabs;
    if (albumTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.album_tabs');
    const chordProTabs = data.chord_pro_tabs;
    if (chordProTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.chord_pro_tabs');
    const featTabs = data.feat_tabs;
    if (featTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.feat_tabs');
    const otherTabs = data.other_tabs;
    if (otherTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.other_tabs');
    return { albumTabs, chordProTabs, featTabs, otherTabs, pagenum, pages };
}
// Returns a list of tab metadata (including tab URL)
// Walks every page of an artist's tab listing, following the pagination data
// embedded in each page, and returns the de-duplicated concatenation of all
// four tab categories (album, chord-pro, featured, other).
async function scrapeAllArtistTabListPages(startURL) {
let tabs = [];
let url = new URL(startURL); // Note: not considering the <base> tag, would have to change the implementation if this gets used somewhere.
while (true) {
//console.log('scraping artist page: ' + url.toString());
const ugDataContent = await scrapeUGDataContent(url.toString());
const page = parseArtistPage(ugDataContent);
tabs = tabs.concat(page.albumTabs, page.chordProTabs, page.featTabs, page.otherTabs);
// Find the pagination entry for the next page; loose `==` is used here —
// presumably because the page number may arrive as a string. TODO confirm.
const nextPageData = page.pages.find(pageData => pageData.page == page.pagenum + 1);
if (nextPageData == null) break; // no next page entry -> last page reached
url = new URL(nextPageData.url, url); // next-page URL may be relative; resolve against the current URL
await fuzzyDelay(); // pacing delay between page fetches
}
// The site can return the same tab (same id) on multiple pages; keep only
// the first occurrence of each id.
const uniqueTabIds = new Set();
const uniqueTabs = [];
for (let tab of tabs) {
if (uniqueTabIds.has(tab.id)) continue;
uniqueTabIds.add(tab.id);
uniqueTabs.push(tab);
}
return uniqueTabs;
}
// Returns a list of artist metadata (including artist tab list URL)
// Pages through an alphabetical band listing (d.htm, d2.htm, d3.htm, ...)
// and concatenates every page's artist entries, logging progress with an ETA.
async function scrapeAllBandListPages(startURL) {
let artists = [];
// https://www.ultimate-guitar.com/bands/d.htm
let url = new URL(startURL);
let startTime = Date.now(); // baseline for the ETA estimate logged below
while (true) {
const ugDataContent = await scrapeUGDataContent(url.toString());
const page = parseBandsPage(ugDataContent);
artists = artists.concat(page.artists);
let fromNow = formatRelative(estMSRemaining(startTime, page.pagenum / page.pagecnt));
console.log(`Band List Status: ${page.pagenum} / ${page.pagecnt} pages complete (${(page.pagenum / page.pagecnt * 100).toFixed(2)}%, ${fromNow} remaining)`);
if (page.pagenum + 1 > page.pagecnt) break; // last page processed
// Build the next page's URL by rewriting the file name:
// d.htm (start) -> d2.htm -> d3.htm -> ...
// NOTE(review): assumes startURL always ends in ".htm" (hence slice(0, -4)) — confirm for all callers.
url = new URL(startURL.slice(0, -4) + (page.pagenum + 1) + '.htm');
await fuzzyDelay(); // pacing delay between page fetches
}
return artists;
}
/**
 * Scrape an entire band listing and persist the resulting artist array
 * to `filename` as JSON.
 * @param {string} filename - output JSON path
 * @param {string} url - first page of the band listing
 */
async function saveBandList(filename, url) {
    const artists = await scrapeAllBandListPages(url);
    await saveJsonData(filename, artists);
}
// Scrapes each band-list URL in turn and saves its artists under output/artists/.
/**
 * Scrape and save every band listing in `urls` sequentially, logging
 * progress with an ETA after each one. Output files go to output/artists/,
 * named after the sanitized source URL.
 * @param {string[]} urls - band-list start URLs
 */
async function saveBandLists(urls) {
    const startTime = Date.now();
    let completed = 0;
    for (const url of urls) {
        console.log('doing band list: ' + url);
        const outputPath = path.join('output', 'artists', sanitizeFileName(url + '.json'));
        await saveBandList(outputPath, url);
        completed += 1;
        const ratioDone = completed / urls.length;
        const fromNow = formatRelative(estMSRemaining(startTime, ratioDone));
        console.log(`Save All Band List Status: ${completed} / ${urls.length} band lists complete (${(ratioDone * 100).toFixed(2)}%, ${fromNow} remaining)`);
        await fuzzyDelay(); // pacing delay between band lists
    }
}
// Note: modifies artists to add a 'tabs' property to each artist. This property contains a list
// of the artist's tab metadatas (tab text is done in a different step)
// Scrapes every artist's full tab list concurrently, then saves the augmented
// artists array to `filename` as JSON.
async function saveArtistsWithTabMetadata(filename, artists) {
const baseURL = 'https://www.ultimate-guitar.com/';
let startTime = Date.now(); // baseline for ETA / throughput logging below
let completed = 0;
let taskQueue = new ConcurrentQueue(8); // Run a maximum of 8 artist tab list scrapers at a time (comment previously said 4, out of sync with the code)
// Note: the concurrent queue will (almost certainly) cause the artists to be somewhat to completely out of order in the output
for (let artist of artists) {
taskQueue.push(async () => {
let artistStartURL = new URL(artist.artist_url, baseURL); // artist_url may be relative; resolve against the site root
let artistTabs = await scrapeAllArtistTabListPages(artistStartURL);
artist.tabs = artistTabs; // mutates the caller's array elements (documented above)
completed += 1;
// Progress logging: percentage complete, ETA, and throughput (%/min, artists/min).
let fromNow = formatRelative(estMSRemaining(startTime, completed / artists.length));
let pctPerMin = ((100 * completed / artists.length) / ((Date.now() - startTime) / (60 * 1000))).toFixed(2);
let artistsPerMin = (completed / ((Date.now() - startTime) / (60 * 1000))).toFixed(2);
console.log(`Save Artists with Tab Metadata Status: ${completed} / ${artists.length} artists complete (${(completed / artists.length * 100).toFixed(2)}%, ${fromNow} remaining, ${pctPerMin} %/min, ${artistsPerMin} artists/min)`);
});
}
await taskQueue.waitForDrain(); // wait for every queued scrape task to finish
await saveJsonData(filename, artists);
}
// Public API: internal save* functions are exported under scraper-oriented names.
module.exports = {
scrapeBands: saveBandLists,
scrapeArtistTabUrls: saveArtistsWithTabMetadata
};