feat(module): Implement full-text index of Pocket documents

This commit is contained in:
Linus Lee
2021-07-24 18:56:28 -04:00
parent 31cd6d42da
commit e85f8d204d
4 changed files with 353 additions and 22 deletions

302
lib/rejson.ink Normal file
View File

@@ -0,0 +1,302 @@
` rejson is a fork of Ink's JSON standard library that serializes lists to
arrays on a best-effort basis and orders object keys lexicographically, to
optimize for compression ratio. It's used in Monocle for space efficiency of
JSON indexes. `
std := load('../vendor/std')
str := load('../vendor/str')
quicksort := load('../vendor/quicksort')
map := std.map
cat := std.cat
sort! := quicksort.sort!
ws? := str.ws?
digit? := str.digit?
` string escape '"' `
esc := ch => point(ch) :: {
` JSON-mandated escapes first: quote and backslash `
34 -> '\\"'
92 -> '\\\\'
` common control characters `
9 -> '\\t'
10 -> '\\n'
13 -> '\\r'
` everything else passes through unchanged `
_ -> ch
}
` escape every JSON-special character in s, one character at a time `
escape := s => (
total := len(s)
(walk := (idx, out) => idx :: {
` reached the end of the string: return the accumulated result `
total -> out
_ -> walk(idx + 1, out + esc(s.(idx)))
})(0, '')
)
` composite to JSON string `
` serialize any Ink value to a JSON string, dispatching on its runtime type `
ser := c => type(c) :: {
` Ink's () becomes JSON null `
'()' -> 'null'
'string' -> '"' + escape(c) + '"'
'number' -> string(c)
'boolean' -> string(c)
` do not serialize functions `
'function' -> 'null'
` we assume that if the property "0" exists and property len(?) - 1 exists,
it's a list. This heuristic isn't perfect and can be fooled but works
remarkably well, especially on known data. `
` NOTE(review): in Ink, = is equality; this arm tests whether either
boundary key is missing (i.e. c is NOT list-like) — confirm that |
binds looser than = here `
'composite' -> c.0 = () | c.(len(c) - 1) = () :: {
` object: keys sorted lexicographically to optimize compression ratio `
true -> '{' + cat(sort!(map(keys(c), k => '"' + escape(k) + '":' + ser(c.(k)))), ',') + '}'
` list: serialize values in order as a JSON array `
_ -> '[' + cat(map(c, v => ser(v)), ',') + ']'
}
}
` is this character a numeral digit or .? `
num? := ch => ch :: {
` the reader's end-of-input sentinel is never numeric `
'' -> false
` a decimal point counts as part of a number `
'.' -> true
_ -> digit?(ch)
}
` reader implementation with internal state for deserialization `
reader := s => (
` mutable cursor state shared by all of the reader's methods `
state := {
idx: 0
` has there been a parse error? `
err?: false
}
` consume and return the next character, or '' once past end-of-input `
next := () => (
state.idx := state.idx + 1
c := s.(state.idx - 1) :: {
() -> ''
_ -> c
}
)
` look at the current character without consuming it; '' at end-of-input `
peek := () => c := s.(state.idx) :: {
() -> ''
_ -> c
}
` the reader's public interface `
{
next: next
peek: peek
` fast-forward through whitespace `
ff: () => (sub := () => ws?(peek()) :: {
true -> (
state.idx := state.idx + 1
sub()
)
})()
` true once the cursor has reached the end of the input string `
done?: () => ~(state.idx < len(s))
` mark a parse error; the flag is sticky and checked by callers `
err: () => state.err? := true
err?: () => state.err?
}
)
` deserialize null `
deNull := r => (
read := r.next
` "null" is exactly four characters: consume and verify them `
read() + read() + read() + read() :: {
'null' -> ()
_ -> (r.err)()
}
)
` deserialize string `
deString := r => (
n := r.next
p := r.peek
` known to be a '"' `
n()
` accumulate characters into acc until the closing quote `
(sub := acc => p() :: {
` unterminated string: flag the error and yield null `
'' -> (
(r.err)()
()
)
'\\' -> (
` eat backslash `
n()
` translate recognized escape sequences; any other escaped character
(including \\ and \/) passes through as itself `
sub(acc + (c := n() :: {
't' -> char(9)
'n' -> char(10)
'r' -> char(13)
'"' -> '"'
_ -> c
}))
)
` closing quote: consume it and return the accumulated string `
'"' -> (
n()
acc
)
_ -> sub(acc + n())
})('')
)
` deserialize number `
deNumber := r => (
n := r.next
p := r.peek
state := {
` was a leading '-' consumed? `
negate?: false
` have we seen a '.' yet? `
decimal?: false
}
` optional leading minus sign `
p() :: {
'-' -> (
n()
state.negate? := true
)
}
` consume digits and at most one decimal point into a string `
result := (sub := acc => num?(p()) :: {
true -> p() :: {
` a second '.' is malformed: flag the error and stop accumulating `
'.' -> state.decimal? :: {
true -> (r.err)()
false -> (
state.decimal? := true
sub(acc + n())
)
}
_ -> sub(acc + n())
}
false -> acc
})('')
` apply the sign; ~ negates the parsed number `
state.negate? :: {
false -> number(result)
true -> ~number(result)
}
)
` deserialize boolean `
deTrue := r => (
read := r.next
` "true" is exactly four characters: consume and verify them `
read() + read() + read() + read() :: {
'true' -> true
_ -> (r.err)()
}
)
deFalse := r => (
read := r.next
` "false" is exactly five characters: consume and verify them `
read() + read() + read() + read() + read() :: {
'false' -> false
_ -> (r.err)()
}
)
` deserialize list `
deList := r => (
n := r.next
p := r.peek
ff := r.ff
` next insertion index into the accumulated list `
state := {
idx: 0
}
` known to be a '[' `
n()
ff()
` parse elements into acc until ']' or an error `
(sub := acc => (r.err?)() :: {
` bail out early if a nested parse already failed `
true -> ()
false -> p() :: {
` unterminated list: flag the error and yield null `
'' -> (
(r.err)()
()
)
` closing bracket: consume it and return the accumulated list `
']' -> (
n()
acc
)
_ -> (
` parse one element recursively, then skip the optional comma `
acc.(state.idx) := der(r)
state.idx := state.idx + 1
ff()
p() :: {
',' -> n()
}
ff()
sub(acc)
)
}
})([])
)
` deserialize composite `
deComp := r => (
n := r.next
p := r.peek
ff := r.ff
` known to be a '{' `
n()
ff()
` parse key-value pairs into acc until '}' or an error `
(sub := acc => (r.err?)() :: {
` bail out early if a nested parse already failed `
true -> ()
false -> p() :: {
` unterminated object: flag the error `
'' -> (r.err)()
` closing brace: consume it and return the accumulated composite `
'}' -> (
n()
acc
)
_ -> (
` keys are JSON strings; error checks after each nested parse
keep us from consuming input past a failure `
key := deString(r)
(r.err?)() :: {
false -> (
ff()
p() :: {
':' -> n()
}
ff()
val := der(r)
(r.err?)() :: {
false -> (
ff()
p() :: {
',' -> n()
}
ff()
acc.(key) := val
sub(acc)
)
}
)
}
)
}
})({})
)
` JSON string in reader to composite `
` parse one JSON value from the reader, dispatching on its first character `
der := r => (
` trim preceding whitespace `
(r.ff)()
result := ((r.peek)() :: {
'n' -> deNull(r)
'"' -> deString(r)
't' -> deTrue(r)
'f' -> deFalse(r)
'[' -> deList(r)
'{' -> deComp(r)
` anything else is assumed to be a number `
_ -> deNumber(r)
})
` if there was a parse error, just return null result `
(r.err?)() :: {
true -> ()
false -> result
}
)
` JSON string to composite `
de := s => der(reader(s))

View File

@@ -11,44 +11,48 @@ const PocketDocs = require(SourceFile);
const PartialDestFile = require(DestFile);
console.log(`Found ${PocketDocs.length} docs, downloading and parsing using @mozilla/readability.`);
console.log(`Partial cache contained ${PartialDestFile.length} docs.`);
(async function() {
const FullTextDocs = [];
// Map of href to doc type
const FullTextDocs = {};
for (const doc of PartialDestFile) {
FullTextDocs[doc.href] = doc;
}
let i = 0;
for (const { title, href } of PocketDocs) {
// To make this process interruptible, we write a partial progress
// cache every 50 items.
if (i % 100 === 0) {
console.log('Writing partial file...');
writeFileSync(DestFile, JSON.stringify(FullTextDocs), 'utf8');
// cache every 25 items.
if (i % 25 === 0) {
const docsSoFar = Object.values(FullTextDocs);
console.log(`Writing partial cache with ${docsSoFar.length} docs...`);
writeFileSync(DestFile, JSON.stringify(docsSoFar), 'utf8');
}
// Skip attempting to parse media files
if (href.endsWith('.png') || href.endsWith('.jpg') ||
href.endsWith('.gif') || href.endsWith('.mp4') ||
href.endsWith('.mov') || href.endsWith('.pdf')) {
FullTextDocs.push({
FullTextDocs[href] = {
title: title,
content: href,
href: href,
});
}
i ++;
continue;
}
const alreadyParsed = PartialDestFile.filter(it => it.href === href)[0];
const alreadyParsed = FullTextDocs[href];
if (alreadyParsed) {
console.log(`Using ${href} found in partial cache...`);
FullTextDocs.push(alreadyParsed);
i ++;
continue;
} else {
console.log(`Parsing (${i + 1}/${PocketDocs.length}) ${href}...`);
}
console.log(`Parsing (${i + 1}/${PocketDocs.length}) ${href}...`);
// For a number of reasons, either JSDOM or Readability may throw if it
// fails to parse the page. In those cases, we bail and just keep the
// title + href.
@@ -67,6 +71,17 @@ console.log(`Found ${PocketDocs.length} docs, downloading and parsing using @moz
});
const page = reader.parse();
if (!page) {
FullTextDocs[href] = {
title: title,
content: href,
href: href,
}
i ++;
continue;
}
const {
title: readabilityTitle,
textContent,
@@ -75,26 +90,35 @@ console.log(`Found ${PocketDocs.length} docs, downloading and parsing using @moz
// If the page is longer than ~10k words, don't cache or index.
// It's not worth it.
if (textContent.length > 5 * 10000) throw new Error('Document too large, not caching').
if (textContent.length > 5 * 10000) {
FullTextDocs[href] = {
title: title,
content: href,
href: href,
}
FullTextDocs.push({
title: `${readabilityTitle} | ${siteName}`,
i ++;
continue;
}
FullTextDocs[href] = {
title: siteName ? `${readabilityTitle} | ${siteName}` : readabilityTitle,
content: textContent || href,
href: href,
});
}
} catch (e) {
console.log(`Error during parse of ${href} (${e})... continuing.`);
FullTextDocs.push({
FullTextDocs[href] = {
title: title,
content: href,
href: href,
});
}
}
i ++;
}
writeFileSync(DestFile, JSON.stringify(FullTextDocs), 'utf8');
console.log('done.');
writeFileSync(DestFile, JSON.stringify(Object.values(FullTextDocs)), 'utf8');
console.log('done!');
})();

View File

@@ -49,10 +49,15 @@ getDocs := withDocs => readFile(PocketExportPath, file => file :: {
_ -> (
links := deJSON(file)
docs := map(links, (link, i) => (
i % 100 :: {
0 -> log(string(i) + ' pages tokenized...')
}
{
id: 'pk' + string(i)
tokens: tokenize(link.title + ' ' + link.href + ' ' + link.content)
content: link.content
` take the first 200 words or so, so our doc index doesn't blow
up completely in size `
content: slice(link.content, 0, 1000)
title: link.title
href: link.href
}

View File

@@ -1,6 +1,6 @@
std := load('../vendor/std')
str := load('../vendor/str')
json := load('../vendor/json')
json := load('../lib/rejson')
log := std.log
f := std.format