these changes were sitting for a while

This commit is contained in:
Anonymous 2022-08-01 19:03:29 -05:00
parent 3763a6b711
commit 87a424494b
10 changed files with 1215 additions and 1206 deletions

18
.gitignore vendored
View File

@ -1,9 +1,9 @@
**/node_modules
**/input/**/*.db
**/input/**/*.db-journal
**/input/**/*.json
**/output/**/*.db
**/output/**/*.db-journal
**/output/**/*.json
**/output/**/*.txt
**/*.7z

View File

@ -1,36 +1,36 @@
const Scraper = require('./url-scraper.js');

// Band index pages handed to Scraper.scrapeBands.
// Comment out the urls that you have already scraped
const bandListUrls = [
  'https://www.ultimate-guitar.com/bands/0-9.htm',
  // 'https://www.ultimate-guitar.com/bands/a.htm',
  // 'https://www.ultimate-guitar.com/bands/b.htm',
  // 'https://www.ultimate-guitar.com/bands/c.htm',
  // 'https://www.ultimate-guitar.com/bands/d.htm',
  // 'https://www.ultimate-guitar.com/bands/e.htm',
  // 'https://www.ultimate-guitar.com/bands/f.htm',
  // 'https://www.ultimate-guitar.com/bands/g.htm',
  // 'https://www.ultimate-guitar.com/bands/h.htm',
  // 'https://www.ultimate-guitar.com/bands/i.htm',
  // 'https://www.ultimate-guitar.com/bands/j.htm',
  // 'https://www.ultimate-guitar.com/bands/k.htm',
  // 'https://www.ultimate-guitar.com/bands/l.htm',
  // 'https://www.ultimate-guitar.com/bands/m.htm',
  // 'https://www.ultimate-guitar.com/bands/n.htm',
  // 'https://www.ultimate-guitar.com/bands/o.htm',
  // 'https://www.ultimate-guitar.com/bands/p.htm',
  // 'https://www.ultimate-guitar.com/bands/q.htm',
  // 'https://www.ultimate-guitar.com/bands/r.htm',
  // 'https://www.ultimate-guitar.com/bands/s.htm',
  // 'https://www.ultimate-guitar.com/bands/t.htm',
  // 'https://www.ultimate-guitar.com/bands/u.htm',
  // 'https://www.ultimate-guitar.com/bands/v.htm',
  // 'https://www.ultimate-guitar.com/bands/w.htm',
  // 'https://www.ultimate-guitar.com/bands/x.htm',
  // 'https://www.ultimate-guitar.com/bands/y.htm',
  // 'https://www.ultimate-guitar.com/bands/z.htm',
];

// Entry point: scrape every active band list. A failure is reported and turned
// into a non-zero exit code instead of being left as an unhandled rejection.
(async () => {
  await Scraper.scrapeBands(bandListUrls);
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@ -1,43 +1,43 @@
const fs = require('fs/promises');
const path = require('path');
const Scraper = require('./url-scraper.js');

// Artist list files (JSON) whose artists get their tab metadata scraped.
// Comment out the artist files that you have already scraped
const artistListFiles = [
  'output/artists/httpswww.ultimate-guitar.combands0-9.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsa.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsb.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsc.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsd.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandse.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsf.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsg.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsh.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsi.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsj.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsk.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsl.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsm.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsn.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandso.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsp.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsq.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsr.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandss.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandst.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsu.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsv.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsw.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsx.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsy.htm.json',
  // 'output/artists/httpswww.ultimate-guitar.combandsz.htm.json',
];

// Entry point: for each active artist list, scrape the tab metadata of its
// artists and write the result to output/artists-with-tabs/artists-part-<n>.json.
(async () => {
  // <n> is the position of the file in the ACTIVE list above.
  // NOTE(review): commenting out earlier entries shifts the numbering, so a
  // later run can reuse part numbers written by an earlier run - confirm that
  // old parts are moved away before re-running.
  let num = 0;
  for (const file of artistListFiles) {
    const artists = JSON.parse(await fs.readFile(file, 'utf8'));
    const outputFile = path.join('output', 'artists-with-tabs', `artists-part-${num}.json`);
    await Scraper.scrapeArtistTabUrls(outputFile, artists);
    // Advance the part number so the next list does not overwrite this one.
    num += 1;
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@ -1,48 +1,48 @@
// Runs a limited number of promises at one time
class ConcurrentQueue {
  /**
   * @param {number} consecutive - Maximum number of tasks that may run at the same time.
   * @throws {RangeError} If `consecutive` is not greater than zero; with such a
   *   limit no task could ever start and waitForDrain() would never resolve.
   */
  constructor(consecutive) {
    if (!(consecutive > 0)) {
      throw new RangeError(`ConcurrentQueue limit must be greater than 0, got ${consecutive}`);
    }
    this.consecutive = consecutive;
    this.queue = []; // pending { task, resolve, reject } entries, oldest first
    this.current = 0; // number of tasks currently running
    this.drainListeners = []; // resolvers of pending waitForDrain() promises
  }

  // Notifies drain listeners when nothing is running or waiting, then starts
  // queued tasks until the concurrency limit is reached.
  _checkQueue() {
    if (this.current === 0 && this.queue.length === 0) {
      const listeners = this.drainListeners;
      this.drainListeners = [];
      for (const notify of listeners) {
        notify();
      }
    }
    while (this.current < this.consecutive && this.queue.length > 0) {
      const entry = this.queue.shift();
      this.current += 1;
      // Intentionally not awaited: _run settles the entry's own promise.
      this._run(entry);
    }
  }

  // Runs one queued entry, forwards its outcome to the promise returned by
  // push(), then frees the slot and re-checks the queue.
  async _run({ task, resolve, reject }) {
    try {
      resolve(await task());
    } catch (err) {
      reject(err);
    }
    this.current -= 1;
    this._checkQueue();
  }

  /**
   * Queues `task` (a function returning a value or promise).
   * @param {Function} task
   * @returns {Promise} a promise that can be awaited to get the resolution or
   *   rejection of the task's execution
   */
  push(task) {
    return new Promise((resolve, reject) => {
      this.queue.push({ task, resolve, reject });
      this._checkQueue();
    });
  }

  /**
   * @returns {Promise<void>} Resolves once no task is running and none is queued
   *   (immediately if that is already the case).
   */
  waitForDrain() {
    return new Promise((resolve) => {
      this.drainListeners.push(resolve);
      this._checkQueue();
    });
  }
}
module.exports = ConcurrentQueue; module.exports = ConcurrentQueue;

View File

@ -1,255 +1,255 @@
const fs = require('fs/promises'); const fs = require('fs/promises');
const path = require('path'); const path = require('path');
const jsdom = require('jsdom'); const jsdom = require('jsdom');
const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args)); const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
const ConcurrentQueue = require('./concurrent-queue.js'); const ConcurrentQueue = require('./concurrent-queue.js');
/**
 * Error type thrown by the scraper helpers in this file.
 * Accepts the same arguments as the built-in Error constructor and forwards
 * them unchanged (e.g. `new ScrapeError('msg', { cause: err })`).
 */
class ScrapeError extends Error {
  constructor(message, options, fileName, lineNumber) {
    super(message, options, fileName, lineNumber);
    this.name = 'ScrapeError';
  }
}
// From Cordis util.js
// Illegal File Names (Windows): CON, PRN, AUX, NUL, COM1..COM9, LPT1..LPT9
const WINDOWS_RESERVED_FILE_NAMES = new Set([
  'CON', 'PRN', 'AUX', 'NUL',
  'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
  'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9',
]);

/**
 * Turns an arbitrary string into a file name.
 *
 * Windows Version (created for Windows, most likely works cross-platform too)
 *   - Reserved Characters <>:"/\|?* are removed.
 *   - If what remains is an illegal Windows file name (compared case-insensitively), '_' is appended.
 *   - Every UTF-16 code unit outside 1-255 is replaced with '_'.
 *
 * @param {string} name
 * @returns {string}
 */
function sanitizeFileName(name) {
  // Reserved Characters
  let sanitized = name.replace(/[<>:"/\\|?*]/g, '');
  // Checked AFTER stripping, so inputs such as 'CON:' cannot collapse into a bad name.
  if (WINDOWS_RESERVED_FILE_NAMES.has(sanitized.toUpperCase())) {
    sanitized += '_';
  }
  // Allowed Characters: Extended Unicode Charset (1-255)
  return sanitized.replace(/[^\u0001-\u00ff]/g, '_');
  // Much stricter whitelist version
  // replace bad characters with '_'
  //return name.split('').map(c => /[A-Za-z0-9-]/.exec(c) ? c : '_').join('');
}
/**
 * Estimates the remaining time of a job by linear extrapolation of the time
 * elapsed so far.
 *
 * @param {number} startTime - Date.now() value taken when the job started.
 * @param {number} ratioComplete - Completed fraction of the job.
 * @returns {number} Estimated milliseconds remaining; Infinity when
 *   `ratioComplete` is not greater than zero (no progress to extrapolate from).
 */
function estMSRemaining(startTime, ratioComplete) {
  if (!(ratioComplete > 0)) {
    return Infinity;
  }
  const elapsedMs = Date.now() - startTime;
  return (1 - ratioComplete) * (elapsedMs / ratioComplete);
}
/**
 * Formats a duration with two decimals, using the largest unit
 * (ms, s, mins, hours) whose threshold the value reaches.
 *
 * @param {number} msRelative - Duration in milliseconds.
 * @returns {string} e.g. '500.00ms', '1.50s', '1.50 mins', '1.50 hours'.
 */
function formatRelative(msRelative) {
  const second = 1000;
  const minute = 60 * second;
  const hour = 60 * minute;
  if (msRelative < second) {
    return `${msRelative.toFixed(2)}ms`;
  }
  if (msRelative < minute) {
    return `${(msRelative / second).toFixed(2)}s`;
  }
  if (msRelative < hour) {
    return `${(msRelative / minute).toFixed(2)} mins`;
  }
  return `${(msRelative / hour).toFixed(2)} hours`;
}
/**
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>} Resolves when the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Waits for a randomized amount of time: `minMs` plus a random share of
 * `jitterMs`. With the defaults the wait is between 500 and 1000 ms.
 *
 * @param {number} [minMs=500] - Fixed part of the delay in milliseconds.
 * @param {number} [jitterMs=500] - Upper bound (exclusive) of the random part.
 * @returns {Promise<void>}
 */
async function fuzzyDelay(minMs = 500, jitterMs = 500) {
  await sleep(minMs + (jitterMs * Math.random()));
}
/**
 * Serializes `dataJSON` with JSON.stringify and writes it to `filename`.
 * The parent directory is created first when it does not exist, so the data is
 * not lost to an ENOENT error at write time.
 *
 * @param {string} filename - Destination path.
 * @param {*} dataJSON - JSON-serializable value.
 * @returns {Promise<void>}
 */
async function saveJsonData(filename, dataJSON) {
  await fs.mkdir(path.dirname(filename), { recursive: true });
  await fs.writeFile(filename, JSON.stringify(dataJSON));
}
// Note: This is the key scraper function. It scrapes the .js-store's data
/**
 * Fetches `url`, parses the response as HTML and returns the JSON-decoded
 * `data-content` attribute of the first `.js-store` element.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<*>} The decoded data-content value.
 * @throws {ScrapeError} If fetching, decoding, parsing or locating the data fails.
 */
async function scrapeUGDataContent(url) {
  let page = null;
  try {
    page = await fetch(url);
  } catch (e) {
    throw new ScrapeError('Unable to fetch url', { cause: e });
  }

  let text = null;
  try {
    text = await page.text();
  } catch (e) {
    throw new ScrapeError('Unable to decode page', { cause: e });
  }

  let dom = null;
  try {
    dom = new jsdom.JSDOM(text);
  } catch (e) {
    throw new ScrapeError('Unable to parse document', { cause: e });
  }
  if (!dom.window || !dom.window.document) {
    throw new ScrapeError('Unable to parse document');
  }

  const jsStore = dom.window.document.querySelector('.js-store');
  if (jsStore == null) {
    // The HTTP status is included because an error or rate-limit response is
    // one way to end up with a page that has no .js-store element.
    throw new ScrapeError('Unable to find .js-store element for ' + url + ` (HTTP ${page.status})`);
  }

  const contentJSON = jsStore.getAttribute('data-content');
  if (contentJSON == null) {
    throw new ScrapeError('Unable to find data-content attribute on .js-store');
  }

  try {
    return JSON.parse(contentJSON);
  } catch (e) {
    // Keep the failure inside the ScrapeError family like every other step.
    throw new ScrapeError('Unable to parse data-content JSON for ' + url, { cause: e });
  }
}
/**
 * Extracts the tab metadata and tab text from a decoded data-content object.
 *
 * @param {object} ugDataContent - Value returned by scrapeUGDataContent.
 * @returns {{meta: *, text: *}} `store.page.data.tab` and
 *   `store.page.data.tab_view.wiki_tab.content`.
 * @throws {ScrapeError} If any step of either path is null or undefined.
 */
function parseTab(ugDataContent) {
  // `== null` on purpose: a missing key yields undefined, not null.
  const need = (value, where) => {
    if (value == null) {
      throw new ScrapeError(`Unable to get ${where}`);
    }
    return value;
  };
  const root = need(ugDataContent, 'ugDataContent');
  const store = need(root.store, 'ugDataContent.store');
  const page = need(store.page, 'ugDataContent.store.page');
  const data = need(page.data, 'ugDataContent.store.page.data');
  const meta = need(data.tab, 'ugDataContent.store.page.data.tab');
  const tview = need(data.tab_view, 'ugDataContent.store.page.data.tab_view');
  const wktab = need(tview.wiki_tab, 'ugDataContent.store.page.data.tab_view.wiki_tab');
  const text = need(wktab.content, 'ugDataContent.store.page.data.tab_view.wiki_tab.content');
  return { meta, text };
}
/**
 * Extracts the band-list fields from a decoded data-content object.
 *
 * @param {object} ugDataContent - Value returned by scrapeUGDataContent.
 * @returns {{alpha: *, artists: *, pagenum: *, pagecnt: *}} `alpha`, `artists`,
 *   `current_page` and `page_count` of `store.page.data`.
 * @throws {ScrapeError} If any of those values (or a step on the way) is null or undefined.
 */
function parseBandsPage(ugDataContent) {
  // `== null` on purpose: a missing key yields undefined, not null.
  const need = (value, where) => {
    if (value == null) {
      throw new ScrapeError(`Unable to get ${where}`);
    }
    return value;
  };
  const root = need(ugDataContent, 'ugDataContent');
  const store = need(root.store, 'ugDataContent.store');
  const page = need(store.page, 'ugDataContent.store.page');
  const data = need(page.data, 'ugDataContent.store.page.data');
  const alpha = need(data.alpha, 'ugDataContent.store.page.data.alpha');
  const artists = need(data.artists, 'ugDataContent.store.page.data.artists');
  const pagenum = need(data.current_page, 'ugDataContent.store.page.data.current_page');
  const pagecnt = need(data.page_count, 'ugDataContent.store.page.data.page_count');
  return { alpha, artists, pagenum, pagecnt };
}
/**
 * Extracts the tab lists and pagination info from a decoded data-content
 * object of an artist page.
 *
 * @param {object} ugDataContent - Value returned by scrapeUGDataContent.
 * @returns {{albumTabs: *, chordProTabs: *, featTabs: *, otherTabs: *, pagenum: *, pages: *}}
 *   `album_tabs`, `chord_pro_tabs`, `feat_tabs`, `other_tabs` of `store.page.data`
 *   plus `current` and `pages` of `store.page.data.pagination`.
 * @throws {ScrapeError} If any of those values (or a step on the way) is null or undefined.
 */
function parseArtistPage(ugDataContent) {
  // `== null` on purpose: a missing key yields undefined, not null.
  const need = (value, where) => {
    if (value == null) {
      throw new ScrapeError(`Unable to get ${where}`);
    }
    return value;
  };
  const root = need(ugDataContent, 'ugDataContent');
  const store = need(root.store, 'ugDataContent.store');
  const page = need(store.page, 'ugDataContent.store.page');
  const data = need(page.data, 'ugDataContent.store.page.data');
  const pagination = need(data.pagination, 'ugDataContent.store.page.data.pagination');
  const pagenum = need(pagination.current, 'ugDataContent.store.page.data.pagination.current');
  const pages = need(pagination.pages, 'ugDataContent.store.page.data.pagination.pages');
  const albumTabs = need(data.album_tabs, 'ugDataContent.store.page.data.album_tabs');
  const chordProTabs = need(data.chord_pro_tabs, 'ugDataContent.store.page.data.chord_pro_tabs');
  // Each list is validated against itself (not against chordProTabs).
  const featTabs = need(data.feat_tabs, 'ugDataContent.store.page.data.feat_tabs');
  const otherTabs = need(data.other_tabs, 'ugDataContent.store.page.data.other_tabs');
  return { albumTabs, chordProTabs, featTabs, otherTabs, pagenum, pages };
}
// Returns a list of tab metadata (including tab URL)
/**
 * Walks an artist's paginated tab list starting at `startURL` and collects the
 * album, chord-pro, feat and other tabs of every page.
 *
 * @param {string|URL} startURL - First page of the artist's tab list.
 * @returns {Promise<Array>} The collected tabs with duplicates (same `id`)
 *   removed; the first occurrence of each id is kept, in encounter order.
 */
async function scrapeAllArtistTabListPages(startURL) {
  let tabs = [];
  let url = new URL(startURL); // Note: not considering the <base> tag, would have to change the implementation if this gets used somewhere.
  while (true) {
    //console.log('scraping artist page: ' + url.toString());
    const page = parseArtistPage(await scrapeUGDataContent(url.toString()));
    tabs = tabs.concat(page.albumTabs, page.chordProTabs, page.featTabs, page.otherTabs);

    // Compare numerically so a page number delivered as a string still works.
    const nextPageNumber = Number(page.pagenum) + 1;
    const nextPageData = page.pages.find((pageData) => Number(pageData.page) === nextPageNumber);
    if (nextPageData == null) break;

    url = new URL(nextPageData.url, url);
    await fuzzyDelay();
  }

  // The same tab (same id) can be returned on different pages. This filters out duplicates.
  const uniqueTabsById = new Map();
  for (const tab of tabs) {
    if (!uniqueTabsById.has(tab.id)) {
      uniqueTabsById.set(tab.id, tab);
    }
  }
  return [...uniqueTabsById.values()];
}
// Returns a list of artist metadata (including artist tab list URL)
/**
 * Scrapes every page of a band list and logs progress after each page.
 *
 * @param {string|URL} startURL - First page of the list, e.g.
 *   https://www.ultimate-guitar.com/bands/d.htm. Follow-up page URLs are built
 *   by dropping the last four characters ('.htm') and appending the page number.
 * @returns {Promise<Array>} The artists of all pages, in page order.
 */
async function scrapeAllBandListPages(startURL) {
  const firstPageURL = String(startURL);
  const startTime = Date.now();
  let artists = [];
  let url = new URL(firstPageURL);
  while (true) {
    const page = parseBandsPage(await scrapeUGDataContent(url.toString()));
    artists = artists.concat(page.artists);

    const ratioComplete = page.pagenum / page.pagecnt;
    const fromNow = formatRelative(estMSRemaining(startTime, ratioComplete));
    console.log(`Band List Status: ${page.pagenum} / ${page.pagecnt} pages complete (${(ratioComplete * 100).toFixed(2)}%, ${fromNow} remaining)`);

    if (page.pagenum + 1 > page.pagecnt) break;
    url = new URL(firstPageURL.slice(0, -4) + (page.pagenum + 1) + '.htm'); // d.htm (start) -> d2.htm -> d3.htm -> ...
    await fuzzyDelay();
  }
  return artists;
}
/**
 * Scrapes all pages of the band list at `url` and saves the artists as JSON.
 *
 * @param {string} filename - Destination file.
 * @param {string} url - First page of the band list.
 * @returns {Promise<void>}
 */
async function saveBandList(filename, url) {
  const artists = await scrapeAllBandListPages(url);
  await saveJsonData(filename, artists);
}
/**
 * Scrapes the given band lists one after another. Each list is written to
 * output/artists/<sanitized url>.json and progress is logged after each one.
 *
 * @param {string[]} urls - First-page URLs of the band lists.
 * @returns {Promise<void>}
 */
async function saveBandLists(urls) {
  const startTime = Date.now();
  let completed = 0;
  for (const url of urls) {
    console.log('doing band list: ' + url);
    const outputFile = path.join('output', 'artists', sanitizeFileName(url + '.json'));
    await saveBandList(outputFile, url);
    completed += 1;

    const ratioComplete = completed / urls.length;
    const fromNow = formatRelative(estMSRemaining(startTime, ratioComplete));
    console.log(`Save All Band List Status: ${completed} / ${urls.length} band lists complete (${(ratioComplete * 100).toFixed(2)}%, ${fromNow} remaining)`);
    await fuzzyDelay();
  }
}
// Note: modifies artists to add a 'tabs' property to each artist. This property contains a list
// of the artist's tab metadatas (tab text is done in a different step)
/**
 * Scrapes the tab list of every artist (several artists at a time) and saves
 * the artists, including their new `tabs` property, as JSON.
 *
 * @param {string} filename - Destination file.
 * @param {Array<object>} artists - Artists with an `artist_url` property; mutated in place.
 * @param {number} [concurrency=8] - Maximum number of artists scraped at the same time.
 * @returns {Promise<void>} Rejects with the first scrape error; nothing is saved in that case.
 */
async function saveArtistsWithTabMetadata(filename, artists, concurrency = 8) {
  const baseURL = 'https://www.ultimate-guitar.com/';
  const startTime = Date.now();
  const taskQueue = new ConcurrentQueue(concurrency);
  let completed = 0;

  // Note: tasks finish in no particular order, so the progress log is not in artist order.
  const pending = artists.map((artist) => taskQueue.push(async () => {
    const artistStartURL = new URL(artist.artist_url, baseURL);
    artist.tabs = await scrapeAllArtistTabListPages(artistStartURL);
    completed += 1;

    const fromNow = formatRelative(estMSRemaining(startTime, completed / artists.length));
    const elapsedMinutes = (Date.now() - startTime) / (60 * 1000);
    const pctPerMin = ((100 * completed / artists.length) / elapsedMinutes).toFixed(2);
    const artistsPerMin = (completed / elapsedMinutes).toFixed(2);
    console.log(`Save Artists with Tab Metadata Status: ${completed} / ${artists.length} artists complete (${(completed / artists.length * 100).toFixed(2)}%, ${fromNow} remaining, ${pctPerMin} %/min, ${artistsPerMin} artists/min)`);
  }));

  // Awaiting the task promises makes a failed artist reject this function
  // instead of surfacing as an unhandled rejection.
  await Promise.all(pending);
  await saveJsonData(filename, artists);
}
module.exports = { module.exports = {
scrapeBands: saveBandLists, scrapeBands: saveBandLists,
scrapeArtistTabUrls: saveArtistsWithTabMetadata scrapeArtistTabUrls: saveArtistsWithTabMetadata
}; };

View File

@ -1,126 +1,126 @@
// Ingests from ./input/* into ./output/tabs-no-text.db
const sqlite3 = require('sqlite3'); const sqlite3 = require('sqlite3');
const sqlite = require('sqlite'); const sqlite = require('sqlite');
const fs = require('fs/promises'); const fs = require('fs/promises');
// Rebuilds ./output/tabs-no-text.db from the artist JSON files in ./input/.
// Existing rows are deleted first, so every run starts from an empty database.
// tab_text is always inserted as NULL by this script.
(async () => {
    const db = await sqlite.open({
        driver: sqlite3.Database,
        filename: './output/tabs-no-text.db'
    });

    // Columns bound by the tabs INSERT, in order, after artist_scrape_id
    const tabColumns = [
        'id', 'song_id', 'song_name', 'artist_id', 'artist_name', 'type', 'part',
        'version', 'votes', 'rating', 'date', 'status', 'preset_id', 'tab_access_type', 'tp_version', 'tonality_name',
        'version_description', 'verified', 'artist_url', 'tab_url', 'tab_text', 'difficulty', 'tuning', 'type_name'
    ];

    try {
        await db.run(`
            CREATE TABLE IF NOT EXISTS artists (
                scrape_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT
                , id INTEGER
                , name TEXT
                , tabscount INTEGER
                , artist_url TEXT
                , tabs_last_update_timestamp INTEGER
            )
        `);
        // artist_name is TEXT: with INTEGER affinity SQLite would store a numeric-looking
        // name (e.g. "311") as a number. Only affects newly created databases.
        await db.run(`
            CREATE TABLE IF NOT EXISTS tabs (
                scrape_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT
                , artist_scrape_id INTEGER NOT NULL
                , id INTEGER
                , song_id INTEGER
                , song_name TEXT
                , artist_id INTEGER
                , artist_name TEXT
                , type TEXT
                , part TEXT
                , version INTEGER
                , votes INTEGER
                , rating NUMERIC
                , date TEXT
                , status TEXT
                , preset_id INTEGER
                , tab_access_type TEXT
                , tp_version INTEGER
                , tonality_name TEXT
                , version_description TEXT
                , verified INTEGER
                , artist_url TEXT
                , tab_url TEXT
                , tab_text TEXT
                , difficulty TEXT
                , tuning TEXT
                , type_name TEXT
                , FOREIGN KEY (artist_scrape_id) REFERENCES artists(scrape_id)
            )
        `);

        // Clear out the database (tabs first, they reference artists)
        await db.run('DELETE FROM tabs');
        await db.run('DELETE FROM artists');

        const files = await fs.readdir('./input/');

        const stmtAddArtist = await db.prepare(`
            INSERT INTO artists (
                id, name, tabscount, artist_url, tabs_last_update_timestamp
            ) VALUES (
                ?1, ?2, ?3, ?4, ?5
            )
        `);
        const tabPlaceholders = ['?1', ...tabColumns.map((_, i) => `?${i + 2}`)].join(', ');
        const stmtAddTab = await db.prepare(`
            INSERT INTO tabs (
                artist_scrape_id, ${tabColumns.join(', ')}
            ) VALUES (
                ${tabPlaceholders}
            )
        `);

        try {
            for (const file of files) {
                if (!file.endsWith('.json')) continue; // skip the .keep file
                console.log('reading ./input/' + file);
                const data = JSON.parse(await fs.readFile('./input/' + file, 'utf8'));

                // One transaction per input file: far fewer disk syncs than one implicit
                // transaction per INSERT, and a failed file leaves no partial rows behind.
                await db.run('BEGIN');
                try {
                    let artistIndex = 0;
                    for (const artist of data) {
                        console.log(`adding artist (${artistIndex+1}/${data.length}, ${artist.tabs.length} tabs): ${artist.name}`)
                        const artistResult = await stmtAddArtist.run([
                            artist.id, artist.name, artist.tabscount, artist.artist_url, artist.tabs_last_update_timestamp
                        ]);
                        const artistScrapeId = artistResult.lastID;
                        for (const tab of artist.tabs) {
                            // Awaited so that insert errors surface here and every row is
                            // written before the statements are finalized.
                            await stmtAddTab.run([
                                artistScrapeId,
                                ...tabColumns.map((column) => (column === 'tab_text' ? null : tab[column] ?? null))
                            ]);
                        }
                        artistIndex += 1;
                    }
                    await db.run('COMMIT');
                } catch (e) {
                    await db.run('ROLLBACK');
                    throw e;
                }
            }
        } finally {
            await stmtAddTab.finalize();
            await stmtAddArtist.finalize();
        }
    } finally {
        await db.close();
    }
})().catch((e) => {
    console.error(e);
    process.exitCode = 1;
});

View File

@ -1,241 +1,241 @@
// Scrapes tabs from ultimate-guitar.com to complete the database // Scrapes tabs from ultimate-guitar.com to complete the database
// node-fetch is an asshole that wants to be ESM-only so we have to do special stuff to import it easily // node-fetch is an asshole that wants to be ESM-only so we have to do special stuff to import it easily
const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args)); const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
const jsdom = require('jsdom'); const jsdom = require('jsdom');
const sqlite3 = require('sqlite3'); const sqlite3 = require('sqlite3');
const sqlite = require('sqlite'); const sqlite = require('sqlite');
const ConcurrentQueue = require('./concurrent-queue.js'); const ConcurrentQueue = require('./concurrent-queue.js');
/**
 * Error thrown by the scraping helpers in this file.
 * Pass `{ cause }` in `options` to keep the underlying error attached.
 */
class ScrapeError extends Error {
    /**
     * @param {string} message - description of what could not be scraped
     * @param {{ cause?: unknown }} [options] - standard Error options
     * @param {string} [fileName] - unused; kept so existing call signatures stay valid
     * @param {number} [lineNumber] - unused; kept so existing call signatures stay valid
     */
    constructor(message, options, fileName, lineNumber) {
        super(message, options);
        this.name = 'ScrapeError';
    }
}
/**
 * Estimates the milliseconds left, assuming the rest of the work proceeds at the
 * average rate observed since `startTime`.
 * @param {number} startTime - Date.now() value taken when the work began
 * @param {number} ratioComplete - fraction of the work finished so far (0..1)
 * @returns {number} estimated milliseconds remaining; Infinity when nothing has
 *     completed yet (ratio is 0, negative or NaN) and no rate can be derived
 */
function estMSRemaining(startTime, ratioComplete) {
    // Written as a negated comparison so that NaN is caught as well
    if (!(ratioComplete > 0)) return Infinity;
    const elapsed = Date.now() - startTime;
    return (1 - ratioComplete) * (elapsed / ratioComplete);
}
/**
 * Formats a duration in milliseconds using the largest fitting unit
 * (ms, s, mins or hours) with two decimals.
 * @param {number} msRelative - duration in milliseconds; may be negative
 * @returns {string} e.g. "250.00ms", "1.50s", "1.50 mins", "2.00 hours"
 */
function formatRelative(msRelative) {
    const SECOND = 1000;
    const MINUTE = 60 * SECOND;
    const HOUR = 60 * MINUTE;
    // The unit is chosen by magnitude so negative durations do not all end up in "ms"
    const magnitude = Math.abs(msRelative);
    if (magnitude < SECOND) return `${(msRelative).toFixed(2)}ms`;
    if (magnitude < MINUTE) return `${(msRelative / SECOND).toFixed(2)}s`;
    if (magnitude < HOUR) return `${(msRelative / MINUTE).toFixed(2)} mins`;
    return `${(msRelative / HOUR).toFixed(2)} hours`;
}
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
}
// modified from background-script to use jsdom
/**
 * Fetches `url` and returns the parsed JSON stored in the `data-content` attribute
 * of the page's `.js-store` element.
 * @param {string|URL} url - page to scrape
 * @returns {Promise<object>} the parsed data-content value
 * @throws {ScrapeError} when the page cannot be fetched, decoded or parsed, or when
 *     the `.js-store` element / `data-content` attribute is missing or not valid JSON
 */
async function scrapeUGDataContent(url) {
    let page = null;
    try {
        page = await fetch(url);
    } catch (e) {
        throw new ScrapeError('Unable to fetch url', { cause: e });
    }

    let text = null;
    try {
        text = await page.text();
    } catch (e) {
        throw new ScrapeError('Unable to decode page', { cause: e });
    }

    let dom = null;
    try {
        dom = new jsdom.JSDOM(text);
    } catch (e) {
        throw new ScrapeError('Unable to parse document', { cause: e });
    }

    try {
        if (!dom.window || !dom.window.document) {
            throw new ScrapeError('Unable to parse document');
        }
        const jsStore = dom.window.document.querySelector('.js-store');
        if (jsStore == null) {
            // The HTTP status tells a block/error page apart from a layout change
            throw new ScrapeError(`Unable to find .js-store element (HTTP ${page.status})`);
        }
        const contentJSON = jsStore.getAttribute('data-content');
        if (contentJSON == null) {
            throw new ScrapeError('Unable to find data-content attribute on .js-store');
        }
        try {
            return JSON.parse(contentJSON);
        } catch (e) {
            throw new ScrapeError('Unable to parse data-content JSON', { cause: e });
        }
    } finally {
        // Release the jsdom window; this function runs once per tab, so leaked windows add up
        if (dom.window) dom.window.close();
    }
}
/**
 * Extracts the tab metadata and tab text from a scraped data-content object.
 * @param {object} ugDataContent - value returned by scrapeUGDataContent
 * @returns {{ meta: object, text: string }} `store.page.data.tab` and
 *     `store.page.data.tab_view.wiki_tab.content`
 * @throws {ScrapeError} naming the first property on that path that is null or missing
 */
function parseGeneralTab(ugDataContent) {
    // Reads parent[key]; `== null` also catches undefined, so a missing key is reported
    // as a ScrapeError instead of a TypeError on the next property access.
    const need = (parent, key, path) => {
        const value = parent == null ? undefined : parent[key];
        if (value == null) throw new ScrapeError(`Unable to get ${path}`);
        return value;
    };

    const store = need(ugDataContent, 'store', 'ugDataContent.store');
    const page = need(store, 'page', 'ugDataContent.store.page');
    const data = need(page, 'data', 'ugDataContent.store.page.data');
    const meta = need(data, 'tab', 'ugDataContent.store.page.data.tab');
    const tview = need(data, 'tab_view', 'ugDataContent.store.page.data.tab_view');
    const wktab = need(tview, 'wiki_tab', 'ugDataContent.store.page.data.tab_view.wiki_tab');
    const text = need(wktab, 'content', 'ugDataContent.store.page.data.tab_view.wiki_tab.content');
    return { meta, text };
}
// Can only fetch Bass, Chords, Drums, Tab, and Ukulele type_name tabs
/**
 * Scrapes the tab page at `url` and returns its metadata and text.
 * @param {string|URL} url - tab page url
 * @returns {Promise<{ meta: object, text: string }>} result of parseGeneralTab
 */
async function fetchGeneralTab(url) {
    return parseGeneralTab(await scrapeUGDataContent(url));
}
// Downloads the text of every fetchable tab in ./input/tabs.db that has no tab_text yet.
// Rows are processed in batches of up to 300 with at most 5 requests in flight, one new
// request being started every 100ms. The run stops when no rows are left or when fewer
// than half of a batch succeeds, which is treated as having been blocked by the site.
(async () => {
    const db = await sqlite.open({
        driver: sqlite3.Database,
        filename: './input/tabs.db'
    });

    // Rows fetchGeneralTab can handle; shared by the three queries below so they cannot drift apart
    const fetchableWhere = `
        tab_url IS NOT NULL
        AND type_name IN ('Bass', 'Chords', 'Drums', 'Tab', 'Ukulele')
    `;
    // Percentage that reports 0 instead of NaN when there is nothing to count
    const percent = (part, whole) => (whole > 0 ? 100 * part / whole : 0);

    let stmtUpdateTab = null;
    try {
        // Progress queries
        const totalFetchable = (await db.get(`
            SELECT COUNT(*) AS c FROM tabs WHERE ${fetchableWhere}
        `)).c;
        console.log(`${totalFetchable} Total Fetchable Tabs`)
        const completedFetchable = (await db.get(`
            SELECT COUNT(*) AS c FROM tabs WHERE tab_text IS NOT NULL AND ${fetchableWhere}
        `)).c;
        console.log(`${completedFetchable} (${percent(completedFetchable, totalFetchable).toFixed(2)}%) Fetchable Tabs already completed`);

        stmtUpdateTab = await db.prepare(`
            UPDATE
                tabs
            SET
                user_id=?1
                , user_iq=?2
                , username=?3
                , tab_text=?4
            WHERE
                scrape_id=?5
        `);

        const remainingFetchable = totalFetchable - completedFetchable;
        const startTime = Date.now();
        // URLs that failed during this session; they stay NULL in the database, so the
        // query keeps returning them and they have to be skipped explicitly.
        const badUrls = new Set();
        let sessionCompleted = 0;

        while (true) {
            const queryStartTime = Date.now();
            const result = await db.all(`
                SELECT
                    scrape_id
                    , tab_url
                FROM
                    tabs
                WHERE
                    tab_text IS NULL
                    AND ${fetchableWhere}
                ORDER BY
                    bucket
                LIMIT 300
            `);
            console.log(`SQLite Query took ${Date.now() - queryStartTime} ms`);
            if (result.length === 0) break;

            // Number of rows in this batch that will actually be attempted
            const attempted = result.filter((tabInfo) => !badUrls.has(tabInfo.tab_url)).length;
            // Log roughly ten times per batch; never 0, because `x % 0` is NaN and would silence the log
            const logEvery = Math.max(1, Math.floor(result.length / 10));
            let batchCompleted = 0;
            const queue = new ConcurrentQueue(5);

            for (const tabInfo of result) {
                if (badUrls.has(tabInfo.tab_url)) continue;
                // Deliberately not awaited: the queue limits concurrency and waitForDrain()
                // below waits for completion. Failures are handled in the catch handler.
                queue.push(async () => {
                    const { meta, text } = await fetchGeneralTab(tabInfo.tab_url);
                    await stmtUpdateTab.run([ meta.user_id, meta.user_iq, meta.username, text, tabInfo.scrape_id ]);
                    batchCompleted += 1;
                    if (batchCompleted % logEvery === 0) {
                        console.log(`batch completed: ${batchCompleted}/${attempted}`);
                    }
                }).catch((e) => {
                    console.error('Error fetching tab for ', tabInfo.tab_url, '. Error:', e.message);
                    badUrls.add(tabInfo.tab_url);
                });
                await sleep(100); // pace request starts
            }
            await queue.waitForDrain();

            sessionCompleted += batchCompleted;
            const msElapsed = Date.now() - startTime;
            const minsElapsed = msElapsed / (60 * 1000);
            const elapsed = formatRelative(msElapsed);
            const estimatedRemaining = formatRelative(estMSRemaining(startTime, sessionCompleted / remainingFetchable));
            const pctComplete = percent(sessionCompleted, remainingFetchable);
            const pctPerMin = pctComplete / minsElapsed;
            const tabsPerMin = sessionCompleted / minsElapsed;
            console.log('');
            console.log(`${sessionCompleted}/${remainingFetchable} tabs complete (${pctComplete.toFixed(2)}%)`);
            console.log(`${tabsPerMin.toFixed(2)} tabs/min (${pctPerMin.toFixed(5)} %/min)`);
            console.log(`${elapsed} elapsed (est. ${estimatedRemaining} remaining)`);
            console.log('');

            // Less than half of the batch succeeding is taken to mean the site blocked us
            if (batchCompleted / result.length < .5) {
                console.log('We got kicked off at ', new Date().toString());
                break;
            }
        }
    } finally {
        if (stmtUpdateTab !== null) await stmtUpdateTab.finalize();
        await db.close();
    }
})().catch((e) => {
    console.error(e);
    process.exitCode = 1;
});

View File

@ -1,48 +1,48 @@
// Runs a limited number of promises at one time
class ConcurrentQueue {
    /**
     * @param {number} consecutive - maximum number of tasks allowed to run at the same time (>= 1)
     * @throws {RangeError} when the limit is below 1 or not a number; such a queue would
     *     never start a task and every push() would hang forever
     */
    constructor(consecutive) {
        if (!(consecutive >= 1)) {
            throw new RangeError(`ConcurrentQueue limit must be >= 1, got ${consecutive}`);
        }
        this.consecutive = consecutive;
        this.queue = [];          // tasks waiting to start: { task, resolve, reject }
        this.current = 0;         // number of tasks currently running
        this.drainListeners = []; // resolvers of pending waitForDrain() promises
    }

    // Notifies drain listeners when idle, then starts waiting tasks while there are free slots
    _checkQueue() {
        if (this.current === 0 && this.queue.length === 0) {
            const listeners = this.drainListeners;
            this.drainListeners = [];
            for (const notify of listeners) {
                notify();
            }
        }
        while (this.current < this.consecutive && this.queue.length > 0) {
            const { task, resolve, reject } = this.queue.shift();
            this.current += 1;
            (async () => {
                try {
                    resolve(await task());
                } catch (e) {
                    reject(e);
                }
                // The slot is freed only after the task's promise has been settled
                this.current -= 1;
                this._checkQueue();
            })();
        }
    }

    /**
     * Queues `task` (a function returning a value or promise) for execution.
     * @param {Function} task
     * @returns {Promise} settles with the resolution or rejection of the task's execution
     */
    push(task) {
        return new Promise((resolve, reject) => {
            this.queue.push({ task, resolve, reject });
            this._checkQueue();
        });
    }

    /**
     * @returns {Promise<void>} resolves once no task is running and none is waiting
     *     (immediately if the queue is already idle)
     */
    waitForDrain() {
        return new Promise((resolve) => {
            this.drainListeners.push(resolve);
            this._checkQueue();
        });
    }
}
module.exports = ConcurrentQueue; module.exports = ConcurrentQueue;

View File

@ -1,126 +1,135 @@
const sqlite3 = require('sqlite3'); const sqlite3 = require('sqlite3');
const sqlite = require('sqlite'); const sqlite = require('sqlite');
const fs = require('fs'); const fs = require('fs');
const path = require('path'); const path = require('path');
const fsExtra = require('fs-extra'); const fsExtra = require('fs-extra');
// Device names Windows reserves regardless of letter case or file extension
const RESERVED_WINDOWS_NAMES = new Set([
    'CON', 'PRN', 'AUX', 'NUL',
    'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
    'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9'
]);

/**
 * Turns an arbitrary string into a file name that is valid on Windows (and therefore
 * on the other common platforms).
 *  - reserved characters <>:"/\|?* are removed
 *  - control characters (0-31) and characters above U+00FF become '_'
 *  - a reserved device name (CON, NUL, COM1, ... in any case, with or without an
 *    extension) gets '_' appended to its base name
 * @param {string} name - raw name, optionally including an extension
 * @returns {string} sanitized name
 */
function sanitizeFileName(name) {
    // Reserved Characters
    const stripped = name.replace(/[<>:"/\\|?*]/g, '');

    // Allowed Characters; iterating by code point maps a surrogate pair to a single '_'
    const cleaned = [...stripped].map((ch) => {
        const code = ch.codePointAt(0);
        return code >= 32 && code <= 255 ? ch : '_';
    }).join('');

    // Illegal File Names (Windows): checked last so that names which only become reserved
    // after stripping (e.g. 'CON?') are caught, and on the part before the first dot
    // because 'CON.txt' is just as reserved as 'CON'.
    const dot = cleaned.indexOf('.');
    const stem = dot === -1 ? cleaned : cleaned.slice(0, dot);
    if (RESERVED_WINDOWS_NAMES.has(stem.toUpperCase())) {
        return `${stem}_${cleaned.slice(stem.length)}`;
    }
    return cleaned;
}
/**
 * Returns a sanitized file name that does not exist yet in `dir`.
 * If the sanitized name is taken, '-2', '-3', ... is inserted before the extension
 * until a free name is found.
 * @param {string} dir - directory the file is going to be written to
 * @param {string} name - desired file name (including extension)
 * @returns {string} file name only, not joined with `dir`
 */
function getAvailableFileName(dir, name) {
    const safeName = sanitizeFileName(name);
    const ext = path.extname(safeName);
    const stem = path.basename(safeName, ext);

    let candidate = stem + ext;
    for (let attempt = 2; fs.existsSync(path.join(dir, candidate)); attempt += 1) {
        candidate = `${stem}-${attempt}${ext}`;
    }
    return candidate;
}
// Removes the [tab], [/tab], [ch] and [/ch] markup tags from a tab's text
function cleanTab(tab_text) {
    return tab_text
        .replace(/\[tab\]/g, '')
        .replace(/\[\/tab\]/g, '')
        .replace(/\[ch]/g, '')
        .replace(/\[\/ch\]/g, '');
}

// Builds the text written to the output file for one row of the tabs table
function formatTabFile(row) {
    const rule = '------------------------------------------------------------------------';
    return [
        `${row.song_name} [${row.song_id}]: ${row.tab_url}`,
        `By ${row.artist_name} [${row.artist_id}]: ${row.artist_url}`,
        `Rating: ${row.rating}, Votes: ${row.votes}`,
        `Tab By: ${row.username} [${row.user_id}] (${row.user_iq} iq)`,
        // NOTE(review): assumes row.date is a unix timestamp in seconds - confirm against the data
        `Last Edit: ${new Date(row.date * 1000).toLocaleString()}${row.version_description ? '\n' + rule + '\n' + row.version_description : ''}`,
        rule,
        cleanTab(row.tab_text).trim(), // Remove [bbcode]tags[/bbcode]
        rule,
        `Tonality: ${row.tonality_name}`,
        `Difficulty: ${row.difficulty}`,
        `Tuning: ${row.tuning}`,
        `Type: ${row.type_name}`,
        `Verified: ${row.verified}`,
        `Version ${row.version}`,
        ''
    ].join('\n');
}

// Writes every tab that has text to ./output/<type_name>/<artist_name>-<artist_id>/<song_name>.txt
(async () => {
    // Clear out old output directory
    await fsExtra.emptyDir('./output/');

    const db = await sqlite.open({
        driver: sqlite3.Database,
        filename: './input/tabs-full.db'
    });

    try {
        console.log('connected to db');

        const total = (await db.get(`
            SELECT COUNT(*) AS c FROM tabs WHERE tab_text IS NOT NULL
        `)).c;

        console.log(`${total} total tabs`);

        let soFar = 0;
        await db.each(`
            SELECT
                scrape_id
                , id
                , song_id
                , song_name
                , artist_id
                , artist_name
                , version
                , version_description
                , votes
                , rating
                , date
                , tonality_name
                , verified
                , artist_url
                , tab_url
                , difficulty
                , tuning
                , type_name
                , user_id
                , user_iq
                , username
                , tab_text
            FROM tabs
            WHERE tab_text IS NOT NULL
            ORDER BY rating * votes + votes
        `, (err, row) => {
            if (err) throw err;

            soFar += 1;

            const typeDir = path.join('output', row.type_name ?? 'null');
            const fileDir = sanitizeFileName(row.artist_name + '-' + row.artist_id);
            const targetDir = path.join(typeDir, fileDir);
            // recursive: creates the type directory as well and is a no-op when both exist
            fs.mkdirSync(targetDir, { recursive: true });

            const fileName = getAvailableFileName(targetDir, row.song_name + '.txt');
            fs.writeFileSync(path.join(targetDir, fileName), formatTabFile(row));

            if (soFar % 100 == 0) {
                console.log(`Tab #${soFar}/${total} (${(100 * soFar / total).toFixed(2)}%): ${path.join(typeDir, fileDir, fileName)}`);
            }
        });
    } finally {
        await db.close();
    }
})().catch((e) => {
    console.error(e);
    process.exitCode = 1;
});

548
README.md
View File

@ -1,274 +1,274 @@
# ultimate-guitar.com Tab Scraper # ultimate-guitar.com Tab Scraper
This file set allows for a 6 step process to scrape the tabs off ultimate-guitar.com. This file set allows for a 6 step process to scrape the tabs off ultimate-guitar.com.
It takes advantage of a "feature" in the ultimate-guitar.com's rendering techniques that puts website data inside of a div with class '.js-store'. It takes advantage of a "feature" in the ultimate-guitar.com's rendering techniques that puts website data inside of a div with class '.js-store'.
Scraping all 1.1 million public tabs from the site is pretty easy and can be done in 6 steps. Scraping all 1.1 million public tabs from the site is pretty easy and can be done in 6 steps.
You're going to want a VPN for this because you will get kicked off and IP blocked every 2-8 hours (depending on how aggressive you are when scraping). Reconnect to another IP and you'll be good to continue scraping. You're going to want a VPN for this because you will get kicked off and IP blocked every 2-8 hours (depending on how aggressive you are when scraping). Reconnect to another IP and you'll be good to continue scraping.
Download the sqlite3 command line client from https://sqlite.org/download.html Download the sqlite3 command line client from https://sqlite.org/download.html
Download Node.js from https://nodejs.org/ Download Node.js from https://nodejs.org/
## 1. Scrape Tab URLs ## 1. Scrape Tab URLs
This step maps out all pages on ultimate-guitar.com that can be scraped This step maps out all pages on ultimate-guitar.com that can be scraped
Enter 01-scraper-urls Enter 01-scraper-urls
Run Run
> npm install > npm install
Open up 01-scrape-bands.js and customize the band list links Open up 01-scrape-bands.js and customize the band list links
Run Run
> node 01-scrape-bands.js > node 01-scrape-bands.js
This script will save the artist data to output/artists/*.json This script will save the artist data to output/artists/*.json
Open up 02-scrape-artist-tab-urls.js and customize the artist file list Open up 02-scrape-artist-tab-urls.js and customize the artist file list
Run Run
> node 02-scrape-artist-tab-urls.js > node 02-scrape-artist-tab-urls.js
This script will add tab information to the artist data and save it to output/artists-with-tabs/*.json This script will add tab information to the artist data and save it to output/artists-with-tabs/*.json
## 2. Ingest scraped urls into a sqlite database ## 2. Ingest scraped urls into a sqlite database
This step converts the .json files into a sqlite database to allow scraping to be paused and restarted easily. This step converts the .json files into a sqlite database to allow scraping to be paused and restarted easily.
Copy 01-scraper-urls/output/artists-with-tabs/*.json into 02-injest-sqlite/input/ Copy 01-scraper-urls/output/artists-with-tabs/*.json into 02-injest-sqlite/input/
Run Run
> npm install > npm install
> node 01-injest-sqlite.js > node 01-injest-sqlite.js
Note: this script queues up the artist inserts into the database and then waits for the inserts to finish. Don't be surprised if it hangs for a few hours (it took 6 hours on my SSD). Note: this script queues up the artist inserts into the database and then waits for the inserts to finish. Don't be surprised if it hangs for a few hours (it took 6 hours on my SSD).
This script will create a sqlite database, output/tabs-no-text.db This script will create a sqlite database, output/tabs-no-text.db
## 3. (Optional) Split the sqlite database for parallelized scraping ## 3. (Optional) Split the sqlite database for parallelized scraping
This step splits the generated sqlite database into multiple databases so that you can more easily use multiple machines to scrape the site. This step splits the generated sqlite database into multiple databases so that you can more easily use multiple machines to scrape the site.
Copy the tabs-no-text.db into 03-splitter-sqlite/ Copy the tabs-no-text.db into 03-splitter-sqlite/
Determine the number of machines that you want to run the scraper on. Call this number N Determine the number of machines that you want to run the scraper on. Call this number N
Open up tabs-no-text.db Open up tabs-no-text.db
> sqlite3 tabs-no-text.db > sqlite3 tabs-no-text.db
Create a view that separates the tabs into buckets Create a view that separates the tabs into buckets
> CREATE VIEW tabs_bucketed AS SELECT *, NTILE(N) OVER (ORDER BY rowid) AS bucket FROM tabs; > CREATE VIEW tabs_bucketed AS SELECT *, NTILE(N) OVER (ORDER BY rowid) AS bucket FROM tabs;
For each machine, i: For each machine, i:
1) Create a new database 1) Create a new database
> sqlite3 tabs-i-no-text.db > sqlite3 tabs-i-no-text.db
2) Attach to the base database 2) Attach to the base database
> ATTACH 'tabs-no-text.db' AS db2; > ATTACH 'tabs-no-text.db' AS db2;
3) Create a table with the rows from the machine's bucket 3) Create a table with the rows from the machine's bucket
> CREATE TABLE tabs AS SELECT * FROM db2.tabs_bucketed WHERE bucket=i; > CREATE TABLE tabs AS SELECT * FROM db2.tabs_bucketed WHERE bucket=i;
Make sure to hold onto your tabs-no-text.db database for the merging process. Make sure to hold onto your tabs-no-text.db database for the merging process.
## 4. Scrape the tabs ## 4. Scrape the tabs
For each machine, i For each machine, i
Copy the machine's tabs database into 04-scraper-tabs/input/tabs.db Copy the machine's tabs database into 04-scraper-tabs/input/tabs.db
Important: Make sure to **change the name** of the database file to **tabs.db** OR change the file name in 01-scrape-tabs.js:100 - filename: './input/tabs-laptop.db'. Important: Make sure to **change the name** of the database file to **tabs.db** OR change the file name in 01-scrape-tabs.js:100 - filename: './input/tabs-laptop.db'.
### 4.1 (Recommended) Index your database ### 4.1 (Recommended) Index your database
It's highly recommended to index your database It's highly recommended to index your database
Open up 04-scraper-tabs/input/ Open up 04-scraper-tabs/input/
Run Run
> sqlite3 tabs.db > sqlite3 tabs.db
Run the following SQL queries Run the following SQL queries
CREATE INDEX IF NOT EXISTS tabs_scrape_id_idx ON tabs (scrape_id); CREATE INDEX IF NOT EXISTS tabs_scrape_id_idx ON tabs (scrape_id);
CREATE INDEX IF NOT EXISTS tabs_tab_url_idx ON tabs (tab_url); CREATE INDEX IF NOT EXISTS tabs_tab_url_idx ON tabs (tab_url);
CREATE INDEX IF NOT EXISTS tabs_type_name_idx ON tabs (type_name); CREATE INDEX IF NOT EXISTS tabs_type_name_idx ON tabs (type_name);
### 4.2 (Optional) Optimize the tab scraper ### 4.2 (Optional) Optimize the tab scraper
Customize 01-scrape-tabs.js Customize 01-scrape-tabs.js
Key Lines: Key Lines:
1) Line 192: let queue = new ConcurrentQueue(5); 1) Line 192: let queue = new ConcurrentQueue(5);
- - Increasing this value will increase the number of concurrent requests sent. - - Increasing this value will increase the number of concurrent requests sent.
- - Note: Higher concurrent request counts result in more aggressive scrapes that may run more quickly but also get you kicked off more quickly - - Note: Higher concurrent request counts result in more aggressive scrapes that may run more quickly but also get you kicked off more quickly
2) Line 214: await sleep(100); 2) Line 214: await sleep(100);
- - Increasing this value (in ms) will increase the delay between the first few concurrent requests. This staggers the requests, potentially reducing the chance you get kicked off. - - Increasing this value (in ms) will increase the delay between the first few concurrent requests. This staggers the requests, potentially reducing the chance you get kicked off.
3) Line 183: LIMIT 300 3) Line 183: LIMIT 300
- - Increasing the value in this line will increase the number of tabs scraped from the database at a time before sending a status update. Lower values will query the database more but give more frequent status updates. Higher values will take up more space in process memory and give less frequent status updates. - - Increasing the value in this line will increase the number of tabs scraped from the database at a time before sending a status update. Lower values will query the database more but give more frequent status updates. Higher values will take up more space in process memory and give less frequent status updates.
- - If this value is set too low, removed tab urls will likely fill up the result set, causing the program to incorrectly detect that it got kicked off. - - If this value is set too low, removed tab urls will likely fill up the result set, causing the program to incorrectly detect that it got kicked off.
I found that running 500 tabs/minute gave me a good balance in effort spent reconnecting to VPN and scraping speed. Typically I would have to reset the scraper every 4-6 hours with this rate. I found that running 500 tabs/minute gave me a good balance in effort spent reconnecting to VPN and scraping speed. Typically I would have to reset the scraper every 4-6 hours with this rate.
I got this with concurrency=5 and sleep=100. I got this with concurrency=5 and sleep=100.
### 4.4 Add required columns for scraping ### 4.4 Add required columns for scraping
Open your tabs.db in sqlite Open your tabs.db in sqlite
> sqlite3 tabs.db > sqlite3 tabs.db
Run the following commands to add the needed columns: Run the following commands to add the needed columns:
ALTER TABLE tabs DROP COLUMN tab_text; ALTER TABLE tabs DROP COLUMN tab_text;
ALTER TABLE tabs ADD COLUMN user_id INTEGER; ALTER TABLE tabs ADD COLUMN user_id INTEGER;
ALTER TABLE tabs ADD COLUMN user_iq INTEGER; ALTER TABLE tabs ADD COLUMN user_iq INTEGER;
ALTER TABLE tabs ADD COLUMN username TEXT; ALTER TABLE tabs ADD COLUMN username TEXT;
ALTER TABLE tabs ADD COLUMN tab_text TEXT; ALTER TABLE tabs ADD COLUMN tab_text TEXT;
### 4.3 Scrape the tabs ### 4.3 Scrape the tabs
Important: Make sure to **change the name** of the database file to **tabs.db** OR change the file name in 01-scrape-tabs.js:100 - filename: './input/tabs-laptop.db'. Important: Make sure to **change the name** of the database file to **tabs.db** OR change the file name in 01-scrape-tabs.js:100 - filename: './input/tabs-laptop.db'.
Run Run
> npm install > npm install
> node 01-scrape-tabs.js > node 01-scrape-tabs.js
This will add tab information to the tabs.db database. This will add tab information to the tabs.db database.
Note: This scraper only works for the following tab types: Note: This scraper only works for the following tab types:
- Bass - Bass
- Chords - Chords
- Drums - Drums
- Tab - Tab
- Ukulele - Ukulele
The following tab types are not supported: The following tab types are not supported:
- Guitar Pro - Guitar Pro
- Official - Official
- Power - Power
- Video - Video
## 5. (Optional) Merge the sqlite databases ## 5. (Optional) Merge the sqlite databases
Move the partial tabs databases from each machine to 05-merger-sqlite/input/{tabs-i.db} Move the partial tabs databases from each machine to 05-merger-sqlite/input/{tabs-i.db}
- Note: i is the machine number from before - Note: i is the machine number from before
Move the tabs-no-text.db database from step 2 into 05-merger-sqlite/input/. Move the tabs-no-text.db database from step 2 into 05-merger-sqlite/input/.
Open up tabs-no-text.db Open up tabs-no-text.db
> sqlite3 tabs-no-text.db > sqlite3 tabs-no-text.db
Create an index on tabs.tab_url Create an index on tabs.tab_url
> CREATE INDEX tabs_tab_url_idx ON tabs (tab_url); > CREATE INDEX tabs_tab_url_idx ON tabs (tab_url);
Open up the final tabs database, tabs-full.db Open up the final tabs database, tabs-full.db
> sqlite3 tabs-full.db > sqlite3 tabs-full.db
Attach the no-text database Attach the no-text database
> ATTACH 'tabs-no-text.db' AS 'dbnt'; > ATTACH 'tabs-no-text.db' AS 'dbnt';
Attach each machine database in the following format: Attach each machine database in the following format:
> ATTACH 'tabs-i.db' AS 'dbi'; > ATTACH 'tabs-i.db' AS 'dbi';
Create the final tabs table Create the final tabs table
CREATE TABLE tabs ( CREATE TABLE tabs (
scrape_id INTEGER scrape_id INTEGER
, artist_scrape_id INTEGER NOT NULL , artist_scrape_id INTEGER NOT NULL
, id INTEGER , id INTEGER
, song_id INTEGER , song_id INTEGER
, song_name TEXT , song_name TEXT
, artist_id INTEGER , artist_id INTEGER
, artist_name INTEGER , artist_name INTEGER
, type TEXT , type TEXT
, part TEXT , part TEXT
, version INTEGER , version INTEGER
, votes INTEGER , votes INTEGER
, rating NUMERIC , rating NUMERIC
, date TEXT , date TEXT
, status TEXT , status TEXT
, preset_id INTEGER , preset_id INTEGER
, tab_access_type TEXT , tab_access_type TEXT
, tp_version INTEGER , tp_version INTEGER
, tonality_name TEXT , tonality_name TEXT
, version_description TEXT , version_description TEXT
, verified INTEGER , verified INTEGER
, artist_url TEXT , artist_url TEXT
, tab_url TEXT , tab_url TEXT
, difficulty TEXT , difficulty TEXT
, tuning TEXT , tuning TEXT
, type_name TEXT , type_name TEXT
, user_id INTEGER , user_id INTEGER
, user_iq INTEGER , user_iq INTEGER
, username TEXT , username TEXT
, tab_text TEXT , tab_text TEXT
); );
For each machine, insert its respective tabs into the table For each machine, insert its respective tabs into the table
INSERT INTO INSERT INTO
tabs tabs
SELECT SELECT
tabsnt.scrape_id tabsnt.scrape_id
, tabsnt.artist_scrape_id , tabsnt.artist_scrape_id
, tabsnt.id , tabsnt.id
, tabsnt.song_id , tabsnt.song_id
, tabsnt.song_name , tabsnt.song_name
, tabsnt.artist_id , tabsnt.artist_id
, tabsnt.artist_name , tabsnt.artist_name
, tabsnt.type , tabsnt.type
, tabsnt.part , tabsnt.part
, tabsnt.version , tabsnt.version
, tabsnt.votes , tabsnt.votes
, tabsnt.rating , tabsnt.rating
, tabsnt.date , tabsnt.date
, tabsnt.status , tabsnt.status
, tabsnt.preset_id , tabsnt.preset_id
, tabsnt.tab_access_type , tabsnt.tab_access_type
, tabsnt.tp_version , tabsnt.tp_version
, tabsnt.tonality_name , tabsnt.tonality_name
, tabsnt.version_description , tabsnt.version_description
, tabsnt.verified , tabsnt.verified
, tabsnt.artist_url , tabsnt.artist_url
, tabsnt.tab_url , tabsnt.tab_url
, tabsnt.difficulty , tabsnt.difficulty
, tabsnt.tuning , tabsnt.tuning
, tabsnt.type_name , tabsnt.type_name
, tabsm.user_id , tabsm.user_id
, tabsm.user_iq , tabsm.user_iq
, tabsm.username , tabsm.username
, tabsm.tab_text , tabsm.tab_text
FROM FROM
dbnt.tabs AS tabsnt dbnt.tabs AS tabsnt
JOIN dbi.tabs AS tabsm ON tabsnt.tab_url=tabsm.tab_url JOIN dbi.tabs AS tabsm ON tabsnt.tab_url=tabsm.tab_url
WHERE WHERE
tabsm.tab_url IS NOT NULL tabsm.tab_url IS NOT NULL
AND tabsm.tab_text IS NOT NULL; AND tabsm.tab_text IS NOT NULL;
Note: this command can take a bit to complete (30s-2m) depending on how large your databases are. Note: this command can take a bit to complete (30s-2m) depending on how large your databases are.
## 6. Print the contents of the database into organized text files ## 6. Print the contents of the database into organized text files
Copy your filled tabs database to 06-output-generator/input/tabs-full.db Copy your filled tabs database to 06-output-generator/input/tabs-full.db
Note: Make sure you either rename it in the directory or update 01-output-generator.js:52 with the proper file name Note: Make sure you either rename it in the directory or update 01-output-generator.js:52 with the proper file name
Run Run
> npm install > npm install
> node --max-old-space-size=16384 01-output-generator.js > node --max-old-space-size=16384 01-output-generator.js
Note: depending on how many tabs you scraped, you may have to increase the max-old-space-size (Max RAM). This example uses 16GB of ram. Note: depending on how many tabs you scraped, you may have to increase the max-old-space-size (Max RAM). This example uses 16GB of ram.
- I'm suspicious the memory leak is in the sqlite package >:| - I'm suspicious the memory leak is in the sqlite package >:|
Congratulations! Your guitar tabs are now organized in: Congratulations! Your guitar tabs are now organized in:
06-output-generator/output/{type}/{artist}-{artist_id}/{song}.txt 06-output-generator/output/{type}/{artist}-{artist_id}/{song}.txt
## Other Information ## Other Information
You can customize the output generator's file output by modifying the fileText variable in 01-output-generator.js:84-99 You can customize the output generator's file output by modifying the fileText variable in 01-output-generator.js:84-99
Note: The .keep files can be ignored/deleted. They are kept to keep the default directory structure in git. Note: The .keep files can be ignored/deleted. They are kept to keep the default directory structure in git.