tab-yoinker/04-scraper-tabs/01-scrape-tabs.js
2022-08-01 19:03:29 -05:00

242 lines
7.2 KiB
JavaScript

// Scrapes tabs from ultimate-guitar.com to complete the database
// node-fetch is published as an ESM-only package, so from this CommonJS script it is loaded lazily through a dynamic import() wrapper
const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
const jsdom = require('jsdom');
const sqlite3 = require('sqlite3');
const sqlite = require('sqlite');
const ConcurrentQueue = require('./concurrent-queue.js');
/**
 * Error type raised by the scraping helpers in this file.
 *
 * Every constructor argument is handed to `Error` untouched, so the usual
 * `new ScrapeError(message, { cause })` form works as it does for `Error`.
 */
class ScrapeError extends Error {
  constructor(...errorArgs) {
    super(...errorArgs);
    // Report the subclass name instead of the inherited "Error".
    this.name = 'ScrapeError';
  }
}
/**
 * Extrapolates the remaining run time from the progress made so far.
 *
 * @param {number} startTime - `Date.now()`-style timestamp (ms) taken when the work began.
 * @param {number} ratioComplete - Fraction of the work finished.
 * @returns {number} Estimated milliseconds left, assuming the rate observed so far holds.
 */
function estMSRemaining(startTime, ratioComplete) {
  const elapsedMs = Date.now() - startTime;
  // Projected duration of the whole job at the pace observed so far.
  const projectedTotalMs = elapsedMs / ratioComplete;
  const ratioLeft = 1 - ratioComplete;
  return ratioLeft * projectedTotalMs;
}
/**
 * Renders a duration in the largest unit that fits, with two decimals.
 *
 * @param {number} msRelative - Duration in milliseconds.
 * @returns {string} The duration as `ms`, `s`, ` mins` or ` hours`.
 */
function formatRelative(msRelative) {
  const ONE_SECOND = 1000;
  const ONE_MINUTE = 60 * 1000;
  const ONE_HOUR = 60 * 60 * 1000;

  if (msRelative < ONE_SECOND) {
    return `${msRelative.toFixed(2)}ms`;
  }
  if (msRelative < ONE_MINUTE) {
    const seconds = msRelative / ONE_SECOND;
    return `${seconds.toFixed(2)}s`;
  }
  if (msRelative < ONE_HOUR) {
    const minutes = msRelative / ONE_MINUTE;
    return `${minutes.toFixed(2)} mins`;
  }
  // Everything else, including values no comparison above accepts, is shown in hours.
  const hours = msRelative / ONE_HOUR;
  return `${hours.toFixed(2)} hours`;
}
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<undefined>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => setTimeout(wake, ms));
}
/**
 * Fetches a page and returns the JSON stored in the `data-content` attribute
 * of its `.js-store` element.
 *
 * Modified from the background-script version to use jsdom.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<*>} The parsed `data-content` value.
 * @throws {ScrapeError} If the page cannot be fetched, decoded or parsed, if
 *   the `.js-store` element or its `data-content` attribute is missing, or if
 *   the attribute does not hold valid JSON. The underlying error, when there
 *   is one, is attached as `cause`.
 */
async function scrapeUGDataContent(url) {
  let page = null;
  try {
    page = await fetch(url);
  } catch (e) {
    throw new ScrapeError('Unable to fetch url', { cause: e });
  }

  let text = null;
  try {
    text = await page.text();
  } catch (e) {
    throw new ScrapeError('Unable to decode page', { cause: e });
  }

  let dom = null;
  try {
    dom = new jsdom.JSDOM(text);
  } catch (e) {
    throw new ScrapeError('Unable to parse document', { cause: e });
  }

  const { window } = dom;
  if (!window || !window.document) {
    throw new ScrapeError('Unable to parse document');
  }

  let contentJSON = null;
  try {
    const jsStore = window.document.querySelector('.js-store');
    if (jsStore == null) {
      throw new ScrapeError('Unable to find .js-store element');
    }
    contentJSON = jsStore.getAttribute('data-content');
  } finally {
    // Only the attribute string is needed from here on. This function runs
    // once per scraped page, so release the jsdom window instead of leaving
    // every parsed document for the garbage collector.
    window.close();
  }

  if (contentJSON == null) {
    throw new ScrapeError('Unable to find data-content attribute on .js-store');
  }

  // Malformed JSON is reported like every other scrape failure instead of
  // escaping as a bare SyntaxError.
  try {
    return JSON.parse(contentJSON);
  } catch (e) {
    throw new ScrapeError('Unable to parse data-content JSON', { cause: e });
  }
}
/**
 * Pulls the tab metadata and tab text out of a scraped data-content object.
 *
 * Reads `store.page.data.tab` (metadata) and
 * `store.page.data.tab_view.wiki_tab.content` (text).
 *
 * @param {object} ugDataContent - Value returned by `scrapeUGDataContent`.
 * @returns {{ meta: object, text: * }} The `tab` object and the `content` value.
 * @throws {ScrapeError} Naming the first step of the path that is null or missing.
 */
function parseGeneralTab(ugDataContent) {
  // `== null` is deliberate: a key absent from parsed JSON reads as
  // `undefined`, which a strict `=== null` test lets through, so the failure
  // would otherwise surface one line later as an anonymous TypeError.
  if (ugDataContent == null) throw new ScrapeError('Unable to get ugDataContent');

  const store = ugDataContent.store;
  if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');

  const page = store.page;
  if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');

  const data = page.data;
  if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');

  const meta = data.tab;
  if (meta == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab');

  const tview = data.tab_view;
  if (tview == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view');

  const wktab = tview.wiki_tab;
  if (wktab == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab');

  const text = wktab.content;
  if (text == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content');

  return { meta, text };
}
/**
 * Scrapes one tab page and extracts its metadata and text.
 *
 * Can only fetch Bass, Chords, Drums, Tab, and Ukulele type_name tabs.
 *
 * @param {string} url - Address of the tab page.
 * @returns {Promise<{ meta: object, text: * }>} Whatever `parseGeneralTab` produces for the scraped page.
 */
async function fetchGeneralTab(url) {
  return parseGeneralTab(await scrapeUGDataContent(url));
}
// Script entry point: repeatedly pulls batches of tabs that still lack
// tab_text, scrapes each one, and writes the result back to ./input/tabs.db
// until nothing is left or too many fetches in a batch fail.
(async () => {
  // Predicate selecting the rows fetchGeneralTab can handle; shared by all
  // three queries below so they cannot drift apart.
  const FETCHABLE_FILTER = `
    tab_url IS NOT NULL
    AND type_name IS NOT NULL
    AND (
      type_name='Bass'
      OR type_name='Chords'
      OR type_name='Drums'
      OR type_name='Tab'
      OR type_name='Ukulele'
    )
  `;
  const BATCH_SIZE = 300;
  // Passed to ConcurrentQueue (defined in ./concurrent-queue.js, not visible
  // here) - presumably the maximum number of tasks in flight; confirm there.
  const QUEUE_CONCURRENCY = 5;
  // Pause between dispatching consecutive fetches.
  const DISPATCH_DELAY_MS = 100;

  const db = await sqlite.open({
    driver: sqlite3.Database,
    filename: './input/tabs.db'
  });
  let stmtUpdateTab = null;

  try {
    // Progress queries
    const totalFetchable = (await db.get(`
      SELECT
        COUNT(*) AS c
      FROM
        tabs
      WHERE
        ${FETCHABLE_FILTER}
    `)).c;
    console.log(`${totalFetchable} Total Fetchable Tabs`);

    const completedFetchable = (await db.get(`
      SELECT
        COUNT(*) AS c
      FROM
        tabs
      WHERE
        tab_text IS NOT NULL
        AND ${FETCHABLE_FILTER}
    `)).c;
    // Avoid printing "NaN%" when the table holds no fetchable rows.
    const pctAlreadyDone = totalFetchable === 0 ? 0 : 100 * completedFetchable / totalFetchable;
    console.log(`${completedFetchable} (${pctAlreadyDone.toFixed(2)}%) Fetchable Tabs already completed`);

    stmtUpdateTab = await db.prepare(`
      UPDATE
        tabs
      SET
        user_id=?1
        , user_iq=?2
        , username=?3
        , tab_text=?4
      WHERE
        scrape_id=?5
    `);

    const remainingFetchable = totalFetchable - completedFetchable;
    const startTime = Date.now();
    // URLs that failed during this run; they keep coming back from the query
    // (tab_text stays NULL), so they are skipped instead of retried.
    const badUrls = new Set();
    let sessionCompleted = 0;

    while (true) {
      const queryStartTime = Date.now();
      const result = await db.all(`
        SELECT
          scrape_id
          , tab_url
        FROM
          tabs
        WHERE
          tab_text IS NULL
          AND ${FETCHABLE_FILTER}
        ORDER BY
          bucket
        LIMIT ${BATCH_SIZE}
      `);
      console.log(`SQLite Query took ${Date.now() - queryStartTime} ms`);
      if (result.length === 0) break;

      let batchCompleted = 0;
      let batchSkipped = 0;
      let batchFailed = 0;
      // Log roughly ten times per batch. The floor is clamped to 1 because a
      // batch of fewer than ten rows would otherwise give `% 0`, which is NaN
      // and never equals 0, silencing progress output entirely.
      const logEvery = Math.max(1, Math.floor(result.length / 10));
      const queue = new ConcurrentQueue(QUEUE_CONCURRENCY);

      const scrapeOne = async (tabInfo) => {
        try {
          await queue.push(async () => {
            const { meta, text } = await fetchGeneralTab(tabInfo.tab_url);
            await stmtUpdateTab.run([ meta.user_id, meta.user_iq, meta.username, text, tabInfo.scrape_id ]);
            batchCompleted += 1;
            if (batchCompleted % logEvery === 0) {
              // Denominator counts only this batch's rows that were neither
              // skipped nor failed so far (not the session-wide bad-URL set).
              console.log(`batch completed: ${batchCompleted}/${result.length - batchSkipped - batchFailed}`);
            }
          });
        } catch (e) {
          console.error('Error fetching tab for ', tabInfo.tab_url, '. Error:', e.message);
          badUrls.add(tabInfo.tab_url);
          batchFailed += 1;
        }
      };

      for (const tabInfo of result) {
        if (badUrls.has(tabInfo.tab_url)) {
          batchSkipped += 1;
          continue;
        }
        // Deliberately not awaited: scrapeOne handles its own errors, and
        // waitForDrain() below is what joins the batch.
        void scrapeOne(tabInfo);
        await sleep(DISPATCH_DELAY_MS);
      }
      await queue.waitForDrain();

      sessionCompleted += batchCompleted;
      const elapsedMs = Date.now() - startTime;
      const minsElapsed = elapsedMs / (60 * 1000);
      const estimatedRemaining = formatRelative(estMSRemaining(startTime, sessionCompleted / remainingFetchable));
      const pctComplete = (100 * sessionCompleted / remainingFetchable);
      const pctPerMin = (pctComplete / minsElapsed);
      const tabsPerMin = (sessionCompleted / minsElapsed);
      console.log('');
      console.log(`${sessionCompleted}/${remainingFetchable} tabs complete (${pctComplete.toFixed(2)}%)`);
      console.log(`${tabsPerMin.toFixed(2)} tabs/min (${pctPerMin.toFixed(5)} %/min)`);
      console.log(`${formatRelative(elapsedMs)} elapsed (est. ${estimatedRemaining} remaining)`);
      console.log('');

      // Stop when fewer than half of the batch's rows were saved; the log
      // message below attributes this to being blocked by the site.
      if (batchCompleted / result.length < .5) {
        console.log('We got kicked off at ', new Date().toString());
        break;
      }
    }
  } finally {
    // Release the statement and the database even when the run aborts.
    try {
      if (stmtUpdateTab !== null) await stmtUpdateTab.finalize();
    } finally {
      await db.close();
    }
  }
})().catch((err) => {
  // Without a handler a failure would surface only as an unhandled rejection.
  console.error('Scrape run failed:', err);
  process.exitCode = 1;
});