242 lines
7.2 KiB
JavaScript
242 lines
7.2 KiB
JavaScript
// Scrapes tabs from ultimate-guitar.com to complete the database
|
|
|
|
// node-fetch is published as an ES module only, so it cannot be require()'d from this
// CommonJS script. Load it through dynamic import() on each call and forward the arguments.
const fetch = async (...args) => {
  const { default: nodeFetch } = await import('node-fetch');
  return nodeFetch(...args);
};
|
|
const jsdom = require('jsdom');
|
|
|
|
const sqlite3 = require('sqlite3');
|
|
const sqlite = require('sqlite');
|
|
|
|
const ConcurrentQueue = require('./concurrent-queue.js');
|
|
|
|
/**
 * Error type raised by the scraping helpers in this file.
 *
 * Accepts the same arguments as the built-in Error constructor and forwards them
 * unchanged; the only addition is that `name` is set to 'ScrapeError'.
 */
class ScrapeError extends Error {
  /**
   * @param {string} [message] - Human-readable description of the failure.
   * @param {object} [options] - Error options, e.g. `{ cause }`.
   * @param {string} [fileName] - Forwarded to Error as-is.
   * @param {number} [lineNumber] - Forwarded to Error as-is.
   */
  constructor(message, options, fileName, lineNumber) {
    super(message, options, fileName, lineNumber);
    this.name = 'ScrapeError';
  }
}
|
|
|
|
/**
 * Estimates how many milliseconds of work are left, extrapolating linearly from
 * the time already spent.
 *
 * @param {number} startTime - Timestamp (ms, as returned by Date.now()) when work began.
 * @param {number} ratioComplete - Fraction of the work finished so far (1 means done).
 * @returns {number} Estimated remaining milliseconds.
 */
function estMSRemaining(startTime, ratioComplete) {
  const elapsedMs = Date.now() - startTime;
  // Projected duration of the whole job at the pace observed so far.
  const projectedTotalMs = elapsedMs / ratioComplete;
  const ratioLeft = 1 - ratioComplete;
  return ratioLeft * projectedTotalMs;
}
|
|
|
|
/**
 * Renders a duration in milliseconds as a short string with two decimals,
 * picking the largest unit (ms, s, mins, hours) whose threshold the value reaches.
 *
 * @param {number} msRelative - Duration in milliseconds.
 * @returns {string} e.g. '500.00ms', '1.50s', '1.50 mins', '1.50 hours'.
 */
function formatRelative(msRelative) {
  const SECOND = 1000;
  const MINUTE = 60 * SECOND;
  const HOUR = 60 * MINUTE;
  const twoDecimals = (value) => value.toFixed(2);

  if (msRelative < SECOND) {
    return `${twoDecimals(msRelative)}ms`;
  }
  if (msRelative < MINUTE) {
    return `${twoDecimals(msRelative / SECOND)}s`;
  }
  if (msRelative < HOUR) {
    return `${twoDecimals(msRelative / MINUTE)} mins`;
  }
  // Anything that is not below an hour (including non-finite values) is shown in hours.
  return `${twoDecimals(msRelative / HOUR)} hours`;
}
|
|
|
|
/**
 * Returns a promise that settles after a timer of `ms` milliseconds fires.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  const startTimer = (done) => {
    setTimeout(done, ms);
  };
  return new Promise(startTimer);
}
|
|
|
|
// modified from background-script to use jsdom
|
|
/**
 * Downloads a page and returns the JSON object stored in the `data-content`
 * attribute of its `.js-store` element.
 *
 * Every failure along the way (network, body decoding, HTML parsing, missing
 * element/attribute, malformed JSON) is reported as a ScrapeError; where an
 * underlying exception exists it is attached as `cause`.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<*>} The parsed `data-content` value.
 * @throws {ScrapeError} If any step of fetching or extracting the data fails.
 */
async function scrapeUGDataContent(url) {
  let page = null;
  try {
    page = await fetch(url);
  } catch (e) {
    throw new ScrapeError('Unable to fetch url', { cause: e });
  }

  let text = null;
  try {
    text = await page.text();
  } catch (e) {
    throw new ScrapeError('Unable to decode page', { cause: e });
  }

  let dom = null;
  try {
    dom = new jsdom.JSDOM(text);
  } catch (e) {
    throw new ScrapeError('Unable to parse document', { cause: e });
  }

  if (!dom.window || !dom.window.document) {
    throw new ScrapeError('Unable to parse document');
  }

  const document = dom.window.document;

  const jsStore = document.querySelector('.js-store');
  if (jsStore == null) {
    throw new ScrapeError('Unable to find .js-store element');
  }

  const contentJSON = jsStore.getAttribute('data-content');
  if (contentJSON == null) {
    throw new ScrapeError('Unable to find data-content attribute on .js-store');
  }

  // Malformed JSON must surface as a ScrapeError like every other step, not as a raw SyntaxError.
  try {
    return JSON.parse(contentJSON);
  } catch (e) {
    throw new ScrapeError('Unable to parse data-content JSON', { cause: e });
  }
}
|
|
|
|
/**
 * Extracts the tab metadata and tab text from a scraped data-content object.
 *
 * Walks ugDataContent.store.page.data and reads `tab` (metadata) and
 * `tab_view.wiki_tab.content` (text). Each step is checked for both `null` and
 * `undefined`, so a missing property is reported as a ScrapeError naming the
 * missing path instead of failing with a TypeError on the next access.
 *
 * @param {object} ugDataContent - Parsed data-content object of a tab page.
 * @returns {{ meta: *, text: * }} The `tab` object and the wiki tab content.
 * @throws {ScrapeError} If any object along the path is null or undefined.
 */
function parseGeneralTab(ugDataContent) {
  // Returns `value` unless it is null/undefined, in which case the path is reported.
  const present = (value, path) => {
    if (value == null) {
      throw new ScrapeError(`Unable to get ${path}`);
    }
    return value;
  };

  const root = present(ugDataContent, 'ugDataContent');
  const store = present(root.store, 'ugDataContent.store');
  const page = present(store.page, 'ugDataContent.store.page');
  const data = present(page.data, 'ugDataContent.store.page.data');
  const meta = present(data.tab, 'ugDataContent.store.page.data.tab');
  const tview = present(data.tab_view, 'ugDataContent.store.page.data.tab_view');
  const wktab = present(tview.wiki_tab, 'ugDataContent.store.page.data.tab_view.wiki_tab');
  const text = present(wktab.content, 'ugDataContent.store.page.data.tab_view.wiki_tab.content');

  return { meta, text };
}
|
|
|
|
// Can only fetch Bass, Chords, Drums, Tab, and Ukulele type_name tabs
|
|
/**
 * Scrapes the page at `url` and extracts its tab metadata and text.
 *
 * @param {string} url - Address of the tab page.
 * @returns {Promise<{ meta: *, text: * }>} Result of parseGeneralTab for the scraped page.
 */
async function fetchGeneralTab(url) {
  return parseGeneralTab(await scrapeUGDataContent(url));
}
|
|
|
|
/**
 * Script entry point.
 *
 * Opens ./input/tabs.db, prints how many fetchable tabs exist and how many already
 * have text, then repeatedly selects up to 300 rows that still lack tab_text,
 * scrapes each one and writes user_id / user_iq / username / tab_text back to its row.
 *
 * The loop ends when no rows are left, when every row of a batch has a URL that
 * already failed in this session, or when fewer than half of the attempted rows
 * of a batch succeed (taken as a sign that the site is refusing requests).
 * The prepared statement and the database are closed even if an error occurs.
 */
(async () => {
  // Rows this script is able to fetch: a URL is known and the type is one of the supported ones.
  const FETCHABLE_TAB_FILTER = `
    tab_url IS NOT NULL
    AND type_name IS NOT NULL
    AND (
      type_name='Bass'
      OR type_name='Chords'
      OR type_name='Drums'
      OR type_name='Tab'
      OR type_name='Ukulele'
    )
  `;

  const db = await sqlite.open({
    driver: sqlite3.Database,
    filename: './input/tabs.db'
  });

  let stmtUpdateTab = null;
  try {
    // Progress queries

    const totalFetchable = (await db.get(`
      SELECT
        COUNT(*) AS c
      FROM
        tabs
      WHERE
        ${FETCHABLE_TAB_FILTER}
    `)).c;

    console.log(`${totalFetchable} Total Fetchable Tabs`);

    const completedFetchable = (await db.get(`
      SELECT
        COUNT(*) AS c
      FROM
        tabs
      WHERE
        tab_text IS NOT NULL
        AND ${FETCHABLE_TAB_FILTER}
    `)).c;

    console.log(`${completedFetchable} (${(100 * completedFetchable / totalFetchable).toFixed(2)}%) Fetchable Tabs already completed`);

    stmtUpdateTab = await db.prepare(`
      UPDATE
        tabs
      SET
        user_id=?1
        , user_iq=?2
        , username=?3
        , tab_text=?4
      WHERE
        scrape_id=?5
    `);

    const remainingFetchable = totalFetchable - completedFetchable;
    let sessionCompleted = 0;
    const startTime = Date.now();
    // URLs that failed during this session; they stay unfinished in the database, so they
    // keep coming back from the query and have to be skipped here.
    const badUrls = new Set();

    while (true) {
      const queryStartTime = Date.now();
      const result = await db.all(`
        SELECT
          scrape_id
          , tab_url
        FROM
          tabs
        WHERE
          tab_text IS NULL
          AND ${FETCHABLE_TAB_FILTER}
        ORDER BY
          bucket
        LIMIT 300
      `);
      console.log(`SQLite Query took ${Date.now() - queryStartTime} ms`);

      if (result.length === 0) break;

      // Only rows whose URL has not already failed are attempted in this batch.
      const pending = result.filter((tabInfo) => !badUrls.has(tabInfo.tab_url));
      if (pending.length === 0) {
        console.log('Every remaining tab in this batch has a known-bad URL; stopping.');
        break;
      }

      let batchCompleted = 0;
      // Log roughly every 10% of the batch; at least 1 so small batches never compute `% 0`.
      const logEvery = Math.max(1, Math.floor(pending.length / 10));
      // NOTE(review): ConcurrentQueue comes from ./concurrent-queue.js; 5 is presumably the
      // maximum number of tasks run at once — confirm against that module.
      const queue = new ConcurrentQueue(5);
      for (const tabInfo of pending) {
        // Deliberately not awaited: the task is handed to the queue and its failure is
        // handled inside this wrapper, so the loop can keep enqueueing.
        (async () => {
          try {
            await queue.push(async () => {
              const { meta, text } = await fetchGeneralTab(tabInfo.tab_url);
              const { user_id, user_iq, username } = meta;
              await stmtUpdateTab.run([ user_id, user_iq, username, text, tabInfo.scrape_id ]);
              batchCompleted += 1;
              if (batchCompleted % logEvery === 0) {
                console.log(`batch completed: ${batchCompleted}/${pending.length}`);
              }
            });
          } catch (e) {
            console.error('Error fetching tab for ', tabInfo.tab_url, '. Error:', e.message);
            badUrls.add(tabInfo.tab_url);
          }
        })();
        // Space out task submission by 100 ms.
        await sleep(100);
      }
      await queue.waitForDrain();

      sessionCompleted += batchCompleted;

      const elapsedMs = Date.now() - startTime;
      const elapsed = formatRelative(elapsedMs);
      const minsElapsed = elapsedMs / (60 * 1000);
      const estimatedRemaining = formatRelative(estMSRemaining(startTime, sessionCompleted / remainingFetchable));
      const pctComplete = (100 * sessionCompleted / remainingFetchable);
      const pctPerMin = (pctComplete / minsElapsed);
      const tabsPerMin = (sessionCompleted / minsElapsed);
      console.log('');
      console.log(`${sessionCompleted}/${remainingFetchable} tabs complete (${pctComplete.toFixed(2)}%)`);
      console.log(`${tabsPerMin.toFixed(2)} tabs/min (${pctPerMin.toFixed(5)} %/min)`);
      console.log(`${elapsed} elapsed (est. ${estimatedRemaining} remaining)`);
      console.log('');

      // Measured against attempted rows only: rows skipped as known-bad made no request
      // and say nothing about whether the site is still serving us.
      if (batchCompleted / pending.length < .5) {
        console.log('We got kicked off at ', new Date().toString());
        break;
      }
    }
  } finally {
    // Release the statement (if it was prepared) and the database even when the run failed.
    if (stmtUpdateTab !== null) {
      await stmtUpdateTab.finalize();
    }
    await db.close();
  }
})().catch((e) => {
  console.error(e);
  process.exitCode = 1;
});
|