// Scrapes tabs from ultimate-guitar.com to complete the database

// node-fetch v3 is ESM-only, so it is loaded lazily through a dynamic import()
// to keep this file usable as a CommonJS script.
const fetch = (...args) => import('node-fetch').then(({ default: fetch }) => fetch(...args));
const jsdom = require('jsdom');
const sqlite3 = require('sqlite3');
const sqlite = require('sqlite');
const ConcurrentQueue = require('./concurrent-queue.js');

// SQL predicate selecting the rows this scraper is able to fetch.
// Shared by the progress queries and the batch query so they cannot drift apart.
const FETCHABLE_FILTER = `
      tab_url IS NOT NULL
  AND type_name IS NOT NULL
  AND type_name IN ('Bass', 'Chords', 'Drums', 'Tab', 'Ukulele')
`;

// Rows pulled from the database per batch.
const BATCH_SIZE = 300;
// Concurrency limit handed to ConcurrentQueue.
const MAX_CONCURRENT_FETCHES = 5;
// Delay between enqueuing two consecutive fetches.
const ENQUEUE_DELAY_MS = 100;
// A batch completing less than this fraction of its rows stops the run.
const MIN_BATCH_SUCCESS_RATIO = 0.5;

/**
 * Error raised for any failure while fetching or decoding a tab page.
 */
class ScrapeError extends Error {
  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {{cause?: unknown}} [options] - Standard Error options (e.g. `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'ScrapeError';
  }
}

/**
 * Linearly extrapolates the remaining time from the progress made so far.
 *
 * @param {number} startTime - Epoch milliseconds at which the work started.
 * @param {number} ratioComplete - Fraction of the work done, in (0, 1].
 *   A ratio of 0 yields Infinity.
 * @returns {number} Estimated milliseconds remaining.
 */
function estMSRemaining(startTime, ratioComplete) {
  return (1 - ratioComplete) * ((Date.now() - startTime) / ratioComplete);
}

/**
 * Formats a duration using the largest fitting unit (ms, s, mins, hours).
 *
 * @param {number} msRelative - Duration in milliseconds.
 * @returns {string} The duration with two decimals and a unit suffix.
 */
function formatRelative(msRelative) {
  if (msRelative < 1000) {
    return `${(msRelative).toFixed(2)}ms`;
  } else if (msRelative < 60 * 1000) {
    return `${(msRelative / 1000).toFixed(2)}s`;
  } else if (msRelative < 60 * 60 * 1000) {
    return `${(msRelative / (60 * 1000)).toFixed(2)} mins`;
  } else {
    return `${(msRelative / (60 * 60 * 1000)).toFixed(2)} hours`;
  }
}

/**
 * Resolves after the given number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Downloads a page and returns the JSON stored in the `data-content`
 * attribute of its `.js-store` element.
 *
 * Modified from background-script to use jsdom.
 *
 * @param {string} url - Page to download.
 * @returns {Promise<any>} The parsed `data-content` JSON.
 * @throws {ScrapeError} If the page cannot be fetched, decoded or parsed, or
 *   if the expected element/attribute is missing or is not valid JSON.
 */
async function scrapeUGDataContent(url) {
  let page = null;
  try {
    page = await fetch(url);
  } catch (e) {
    throw new ScrapeError('Unable to fetch url', { cause: e });
  }

  // fetch() only rejects on network failures; HTTP errors must be checked
  // explicitly, otherwise an error page gets parsed as if it were a tab.
  if (!page.ok) {
    throw new ScrapeError(`Unable to fetch url (HTTP ${page.status})`);
  }

  let text = null;
  try {
    text = await page.text();
  } catch (e) {
    throw new ScrapeError('Unable to decode page', { cause: e });
  }

  let dom = null;
  try {
    dom = new jsdom.JSDOM(text);
  } catch (e) {
    throw new ScrapeError('Unable to parse document', { cause: e });
  }
  if (!dom.window || !dom.window.document) {
    throw new ScrapeError('Unable to parse document');
  }

  const document = dom.window.document;
  const jsStore = document.querySelector('.js-store');
  if (jsStore == null) {
    throw new ScrapeError('Unable to find .js-store element');
  }

  const contentJSON = jsStore.getAttribute('data-content');
  if (contentJSON == null) {
    throw new ScrapeError('Unable to find data-content attribute on .js-store');
  }

  try {
    return JSON.parse(contentJSON);
  } catch (e) {
    throw new ScrapeError('Unable to parse data-content JSON', { cause: e });
  }
}

/**
 * Extracts the tab metadata and tab text from a scraped data-content object.
 *
 * Reads `store.page.data.tab` (metadata) and
 * `store.page.data.tab_view.wiki_tab.content` (text).
 *
 * @param {any} ugDataContent - Object returned by scrapeUGDataContent.
 * @returns {{meta: any, text: any}} The metadata object and the tab text.
 * @throws {ScrapeError} If any link of the property path is null or missing.
 */
function parseGeneralTab(ugDataContent) {
  // `== null` is deliberate: a property absent from the JSON reads as
  // undefined, which a strict `=== null` comparison would let through.
  if (ugDataContent == null) throw new ScrapeError('Unable to get ugDataContent');

  const store = ugDataContent.store;
  if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');

  const page = store.page;
  if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');

  const data = page.data;
  if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');

  const meta = data.tab;
  if (meta == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab');

  const tview = data.tab_view;
  if (tview == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view');

  const wktab = tview.wiki_tab;
  if (wktab == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab');

  const text = wktab.content;
  if (text == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content');

  return { meta, text };
}

/**
 * Fetches a tab page and extracts its metadata and text.
 *
 * Can only fetch Bass, Chords, Drums, Tab, and Ukulele type_name tabs.
 *
 * @param {string} url - Tab page URL.
 * @returns {Promise<{meta: any, text: any}>}
 * @throws {ScrapeError} See scrapeUGDataContent and parseGeneralTab.
 */
async function fetchGeneralTab(url) {
  const ugDataContent = await scrapeUGDataContent(url);
  return parseGeneralTab(ugDataContent);
}

/**
 * Fills in user_id, user_iq, username and tab_text for every fetchable row of
 * ./input/tabs.db that has no tab_text yet, in batches, printing progress.
 *
 * Stops when no rows are left or when a batch completes fewer than half of
 * its rows. The prepared statement and the database are always released.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const db = await sqlite.open({ driver: sqlite3.Database, filename: './input/tabs.db' });
  let stmtUpdateTab = null;

  try {
    // Progress queries
    const totalFetchable = (await db.get(`
      SELECT COUNT(*) AS c
      FROM tabs
      WHERE ${FETCHABLE_FILTER}
    `)).c;
    console.log(`${totalFetchable} Total Fetchable Tabs`);

    const completedFetchable = (await db.get(`
      SELECT COUNT(*) AS c
      FROM tabs
      WHERE tab_text IS NOT NULL
        AND ${FETCHABLE_FILTER}
    `)).c;
    // Avoid printing "NaN%" when the table holds no fetchable rows.
    const pctAlreadyDone = totalFetchable === 0 ? 0 : 100 * completedFetchable / totalFetchable;
    console.log(`${completedFetchable} (${pctAlreadyDone.toFixed(2)}%) Fetchable Tabs already completed`);

    stmtUpdateTab = await db.prepare(`
      UPDATE tabs
      SET user_id=?1
        , user_iq=?2
        , username=?3
        , tab_text=?4
      WHERE scrape_id=?5
    `);

    const remainingFetchable = totalFetchable - completedFetchable;
    let sessionCompleted = 0;
    const startTime = Date.now();
    // URLs that failed once; they still match the batch query (tab_text stays
    // NULL), so they are skipped instead of being retried on every batch.
    const badUrls = new Set();

    while (true) {
      const queryStartTime = Date.now();
      const result = await db.all(`
        SELECT scrape_id
             , tab_url
        FROM tabs
        WHERE tab_text IS NULL
          AND ${FETCHABLE_FILTER}
        ORDER BY bucket
        LIMIT ${BATCH_SIZE}
      `);
      console.log(`SQLite Query took ${Date.now() - queryStartTime} ms`);

      if (result.length === 0) break;

      let batchCompleted = 0;
      let batchSkipped = 0;
      // At least 1 so that batches of fewer than 10 rows do not compute `% 0`.
      const progressStep = Math.max(1, Math.floor(result.length / 10));
      const queue = new ConcurrentQueue(MAX_CONCURRENT_FETCHES);
      const tasks = [];

      for (const tabInfo of result) {
        if (badUrls.has(tabInfo.tab_url)) {
          batchSkipped += 1;
          continue;
        }

        // Deliberately not awaited here: the queue bounds the concurrency and
        // the loop only paces how quickly work is enqueued.
        tasks.push((async () => {
          try {
            await queue.push(async () => {
              const { meta, text } = await fetchGeneralTab(tabInfo.tab_url);
              await stmtUpdateTab.run([
                meta.user_id, meta.user_iq, meta.username, text, tabInfo.scrape_id,
              ]);
              batchCompleted += 1;
              if (batchCompleted % progressStep === 0) {
                console.log(`batch completed: ${batchCompleted}/${result.length - batchSkipped}`);
              }
            });
          } catch (e) {
            console.error('Error fetching tab for ', tabInfo.tab_url, '. Error:', e.message);
            badUrls.add(tabInfo.tab_url);
          }
        })());

        await sleep(ENQUEUE_DELAY_MS);
      }

      await queue.waitForDrain();
      // The wrappers never reject (they catch internally); awaiting them makes
      // sure every failure has been recorded before the statistics are read.
      await Promise.all(tasks);

      sessionCompleted += batchCompleted;

      const elapsedMs = Date.now() - startTime;
      const minsElapsed = elapsedMs / (60 * 1000);
      const elapsed = formatRelative(elapsedMs);
      const estimatedRemaining = formatRelative(estMSRemaining(startTime, sessionCompleted / remainingFetchable));
      const pctComplete = 100 * sessionCompleted / remainingFetchable;
      const pctPerMin = pctComplete / minsElapsed;
      const tabsPerMin = sessionCompleted / minsElapsed;

      console.log('');
      console.log(`${sessionCompleted}/${remainingFetchable} tabs complete (${pctComplete.toFixed(2)}%)`);
      console.log(`${tabsPerMin.toFixed(2)} tabs/min (${pctPerMin.toFixed(5)} %/min)`);
      console.log(`${elapsed} elapsed (est. ${estimatedRemaining} remaining)`);
      console.log('');

      // A mostly failed batch is taken to mean the site started refusing us.
      if (batchCompleted / result.length < MIN_BATCH_SUCCESS_RATIO) {
        console.log('We got kicked off at ', new Date().toString());
        break;
      }
    }
  } finally {
    // Release the statement and the connection even when the run fails.
    try {
      if (stmtUpdateTab !== null) await stmtUpdateTab.finalize();
    } finally {
      await db.close();
    }
  }
}

main().catch((e) => {
  console.error(e);
  process.exitCode = 1;
});