Mirror of https://github.com/thesephist/monocle.git, synced 2021-07-26 21:13:15 +03:00
feat(module): Implement full-text index of Pocket documents
lib/rejson.ink (new file, +302)
@@ -0,0 +1,302 @@
` rejson is a fork of Ink's JSON standard library that serializes lists to
	arrays on a best-effort basis and orders object keys lexicographically, to
	optimize for compression ratio. It's used in Monocle for space efficiency of
	JSON indexes. `
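
` for example (an illustrative note, not part of the committed file):
	ser({b: 2, a: ['x', 'y']}) gives '{"a":["x","y"],"b":2}'; compared to a
	serializer that preserves insertion order and writes lists as objects with
	numeric keys, the sorted keys and array syntax are shorter and more
	repetitive, so they compress better `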

std := load('../vendor/std')
str := load('../vendor/str')
quicksort := load('../vendor/quicksort')

map := std.map
cat := std.cat
sort! := quicksort.sort!

ws? := str.ws?
digit? := str.digit?

` string escape '"' `
esc := c => point(c) :: {
	9 -> '\\t'
	10 -> '\\n'
	13 -> '\\r'
	34 -> '\\"'
	92 -> '\\\\'
	_ -> c
}
escape := s => (
	max := len(s)
	(sub := (i, acc) => i :: {
		max -> acc
		_ -> sub(i + 1, acc + esc(s.(i)))
	})(0, '')
)

` composite to JSON string `
ser := c => type(c) :: {
	'()' -> 'null'
	'string' -> '"' + escape(c) + '"'
	'number' -> string(c)
	'boolean' -> string(c)
	` do not serialize functions `
	'function' -> 'null'
	` we assume that if the property "0" exists and property len(?) - 1 exists,
		it's a list. This heuristic isn't perfect and can be fooled but works
		remarkably well, especially on known data. `
	'composite' -> c.0 = () | c.(len(c) - 1) = () :: {
		true -> '{' + cat(sort!(map(keys(c), k => '"' + escape(k) + '":' + ser(c.(k)))), ',') + '}'
		_ -> '[' + cat(map(c, v => ser(v)), ',') + ']'
	}
}
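
` to illustrate the heuristic above (examples added for clarity, not part of
	the committed file): ser({0: 'a', 1: 'b'}) yields '["a","b"]' because keys
	0 and len - 1 both exist, while ser({0: 'a', 5: 'b'}) yields
	'{"0":"a","5":"b"}' because len is 2 and key 1 is missing `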

` is this character a numeral digit or .? `
num? := c => c :: {
	'' -> false
	'.' -> true
	_ -> digit?(c)
}

` reader implementation with internal state for deserialization `
reader := s => (
	state := {
		idx: 0
		` has there been a parse error? `
		err?: false
	}

	next := () => (
		state.idx := state.idx + 1
		c := s.(state.idx - 1) :: {
			() -> ''
			_ -> c
		}
	)

	peek := () => c := s.(state.idx) :: {
		() -> ''
		_ -> c
	}

	{
		next: next
		peek: peek
		` fast-forward through whitespace `
		ff: () => (sub := () => ws?(peek()) :: {
			true -> (
				state.idx := state.idx + 1
				sub()
			)
		})()
		done?: () => ~(state.idx < len(s))
		err: () => state.err? := true
		err?: () => state.err?
	}
)
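
` the reader closes over idx and err?, so all the de* functions below share
	one cursor and one error flag; note that next() and peek() return the
	empty string, not (), once past the end of input
	(clarifying comment, not part of the committed file) `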

` deserialize null `
deNull := r => (
	n := r.next
	n() + n() + n() + n() :: {
		'null' -> ()
		_ -> (r.err)()
	}
)

` deserialize string `
deString := r => (
	n := r.next
	p := r.peek

	` known to be a '"' `
	n()

	(sub := acc => p() :: {
		'' -> (
			(r.err)()
			()
		)
		'\\' -> (
			` eat backslash `
			n()
			sub(acc + (c := n() :: {
				't' -> char(9)
				'n' -> char(10)
				'r' -> char(13)
				'"' -> '"'
				_ -> c
			}))
		)
		'"' -> (
			n()
			acc
		)
		_ -> sub(acc + n())
	})('')
)

` deserialize number `
deNumber := r => (
	n := r.next
	p := r.peek
	state := {
		negate?: false
		` have we seen a '.' yet? `
		decimal?: false
	}

	p() :: {
		'-' -> (
			n()
			state.negate? := true
		)
	}

	result := (sub := acc => num?(p()) :: {
		true -> p() :: {
			'.' -> state.decimal? :: {
				true -> (r.err)()
				false -> (
					state.decimal? := true
					sub(acc + n())
				)
			}
			_ -> sub(acc + n())
		}
		false -> acc
	})('')

	state.negate? :: {
		false -> number(result)
		true -> ~number(result)
	}
)
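
` worked example (not part of the committed file): on reader('-12.5') the
	leading '-' sets negate?, sub accumulates '12.5' while permitting a single
	'.', and the result is ~number('12.5'), i.e. -12.5; a second '.' would
	flag a parse error `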

` deserialize boolean `
deTrue := r => (
	n := r.next
	n() + n() + n() + n() :: {
		'true' -> true
		_ -> (r.err)()
	}
)
deFalse := r => (
	n := r.next
	n() + n() + n() + n() + n() :: {
		'false' -> false
		_ -> (r.err)()
	}
)

` deserialize list `
deList := r => (
	n := r.next
	p := r.peek
	ff := r.ff
	state := {
		idx: 0
	}

	` known to be a '[' `
	n()
	ff()

	(sub := acc => (r.err?)() :: {
		true -> ()
		false -> p() :: {
			'' -> (
				(r.err)()
				()
			)
			']' -> (
				n()
				acc
			)
			_ -> (
				acc.(state.idx) := der(r)
				state.idx := state.idx + 1

				ff()
				p() :: {
					',' -> n()
				}

				ff()
				sub(acc)
			)
		}
	})([])
)
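
` elements land at consecutive numeric keys via acc.(state.idx), producing
	exactly the dense-keyed composite that ser's list heuristic re-serializes
	as a JSON array (clarifying comment, not part of the committed file) `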

` deserialize composite `
deComp := r => (
	n := r.next
	p := r.peek
	ff := r.ff

	` known to be a '{' `
	n()
	ff()

	(sub := acc => (r.err?)() :: {
		true -> ()
		false -> p() :: {
			'' -> (r.err)()
			'}' -> (
				n()
				acc
			)
			_ -> (
				key := deString(r)

				(r.err?)() :: {
					false -> (
						ff()
						p() :: {
							':' -> n()
						}

						ff()
						val := der(r)

						(r.err?)() :: {
							false -> (
								ff()
								p() :: {
									',' -> n()
								}

								ff()
								acc.(key) := val
								sub(acc)
							)
						}
					)
				}
			)
		}
	})({})
)

` JSON string in reader to composite `
der := r => (
	` trim preceding whitespace `
	(r.ff)()

	result := ((r.peek)() :: {
		'n' -> deNull(r)
		'"' -> deString(r)
		't' -> deTrue(r)
		'f' -> deFalse(r)
		'[' -> deList(r)
		'{' -> deComp(r)
		_ -> deNumber(r)
	})

	` if there was a parse error, just return null result `
	(r.err?)() :: {
		true -> ()
		false -> result
	}
)

` JSON string to composite `
de := s => der(reader(s))
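
` round-trip usage (illustrative, not part of the committed file):
	de('{"a":["x","y"],"b":2}') gives {a: ['x', 'y'], b: 2}, and any input
	that fails to parse comes back as () because der checks err? before
	returning its result `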

@@ -11,44 +11,48 @@ const PocketDocs = require(SourceFile);
 const PartialDestFile = require(DestFile);
 
 console.log(`Found ${PocketDocs.length} docs, downloading and parsing using @mozilla/readability.`);
+console.log(`Partial cache contained ${PartialDestFile.length} docs.`);
 
 (async function() {
-    const FullTextDocs = [];
+    // Map of href to doc type
+    const FullTextDocs = {};
+    for (const doc of PartialDestFile) {
+        FullTextDocs[doc.href] = doc;
+    }
+
     let i = 0;
     for (const { title, href } of PocketDocs) {
         // To make this process interruptible, we write a partial progress
-        // cache every 50 items.
-        if (i % 100 === 0) {
-            console.log('Writing partial file...');
-            writeFileSync(DestFile, JSON.stringify(FullTextDocs), 'utf8');
+        // cache every 25 items.
+        if (i % 25 === 0) {
+            const docsSoFar = Object.values(FullTextDocs);
+            console.log(`Writing partial cache with ${docsSoFar.length} docs...`);
+            writeFileSync(DestFile, JSON.stringify(docsSoFar), 'utf8');
         }
 
         // Skip attempting to parse media files
         if (href.endsWith('.png') || href.endsWith('.jpg') ||
             href.endsWith('.gif') || href.endsWith('.mp4') ||
             href.endsWith('.mov') || href.endsWith('.pdf')) {
-            FullTextDocs.push({
+            FullTextDocs[href] = {
                 title: title,
                 content: href,
                 href: href,
-            });
+            }
 
             i ++;
             continue;
         }
 
-        const alreadyParsed = PartialDestFile.filter(it => it.href === href)[0];
+        const alreadyParsed = FullTextDocs[href];
         if (alreadyParsed) {
             console.log(`Using ${href} found in partial cache...`);
-            FullTextDocs.push(alreadyParsed);
-
             i ++;
             continue;
-        } else {
-            console.log(`Parsing (${i + 1}/${PocketDocs.length}) ${href}...`);
         }
 
+        console.log(`Parsing (${i + 1}/${PocketDocs.length}) ${href}...`);
+
         // For a number of reasons, either JSDOM or Readability may throw if it
         // fails to parse the page. In those cases, we bail and just keep the
         // title + href.
@@ -67,6 +71,17 @@ console.log(`Found ${PocketDocs.length} docs, downloading and parsing using @moz
         });
 
         const page = reader.parse();
+        if (!page) {
+            FullTextDocs[href] = {
+                title: title,
+                content: href,
+                href: href,
+            }
+
+            i ++;
+            continue;
+        }
+
         const {
             title: readabilityTitle,
             textContent,
@@ -75,26 +90,35 @@ console.log(`Found ${PocketDocs.length} docs, downloading and parsing using @moz
 
             // If the page is longer than ~10k words, don't cache or index.
             // It's not worth it.
-            if (textContent.length > 5 * 10000) throw new Error('Document too large, not caching').
+            if (textContent.length > 5 * 10000) {
+                FullTextDocs[href] = {
+                    title: title,
+                    content: href,
+                    href: href,
+                }
 
-            FullTextDocs.push({
-                title: `${readabilityTitle} | ${siteName}`,
+                i ++;
+                continue;
+            }
+
+            FullTextDocs[href] = {
+                title: siteName ? `${readabilityTitle} | ${siteName}` : readabilityTitle,
                 content: textContent || href,
                 href: href,
-            });
+            }
         } catch (e) {
             console.log(`Error during parse of ${href} (${e})... continuing.`);
-            FullTextDocs.push({
+            FullTextDocs[href] = {
                 title: title,
                 content: href,
                 href: href,
-            });
+            }
         }
 
         i ++;
     }
 
-    writeFileSync(DestFile, JSON.stringify(FullTextDocs), 'utf8');
-    console.log('done.');
+    writeFileSync(DestFile, JSON.stringify(Object.values(FullTextDocs)), 'utf8');
+    console.log('done!');
 })();
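
The gist of the diff above: FullTextDocs becomes a map keyed by href instead of an array, so an interrupted run can reload its partial cache and skip already-parsed URLs in O(1) rather than scanning an array, and failed parses never duplicate entries. A minimal standalone sketch of that resume pattern follows; the names (CACHE_FILE, cache, fetchAll, parseDoc) are hypothetical and this is not code from the commit:

    const { existsSync, readFileSync, writeFileSync } = require('fs');

    // Hypothetical cache path; the real script derives DestFile elsewhere.
    const CACHE_FILE = './fulltext.json';

    // Re-key the on-disk array by href so lookups and overwrites are O(1)
    // and re-runs never duplicate a document.
    const cache = {};
    if (existsSync(CACHE_FILE)) {
        for (const doc of JSON.parse(readFileSync(CACHE_FILE, 'utf8'))) {
            cache[doc.href] = doc;
        }
    }

    // parseDoc is an async (href) => ({title, content, href}) or null.
    async function fetchAll(docs, parseDoc) {
        let i = 0;
        for (const { title, href } of docs) {
            // Flush values (not the map itself) periodically, so the on-disk
            // format stays a flat JSON array like the commit's DestFile.
            if (i % 25 === 0) {
                writeFileSync(CACHE_FILE, JSON.stringify(Object.values(cache)), 'utf8');
            }
            if (!cache[href]) {
                // Fall back to title + href when parsing yields nothing,
                // mirroring the commit's error paths.
                cache[href] = (await parseDoc(href)) || { title, content: href, href };
            }
            i++;
        }
        writeFileSync(CACHE_FILE, JSON.stringify(Object.values(cache)), 'utf8');
    }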

@@ -49,10 +49,15 @@ getDocs := withDocs => readFile(PocketExportPath, file => file :: {
 	_ -> (
 		links := deJSON(file)
 		docs := map(links, (link, i) => (
+			i % 100 :: {
+				0 -> log(string(i) + ' pages tokenized...')
+			}
 			{
 				id: 'pk' + string(i)
 				tokens: tokenize(link.title + ' ' + link.href + ' ' + link.content)
-				content: link.content
+				` take the first 200 words or so, so our doc index doesn't blow
+					up completely in size `
+				content: slice(link.content, 0, 1000)
 				title: link.title
 				href: link.href
 			}
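Note on the truncation above: slice(link.content, 0, 1000) keeps the first 1000 characters, which at roughly five characters per English word works out to the "200 words or so" the comment aims for.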

@@ -1,6 +1,6 @@
 std := load('../vendor/std')
 str := load('../vendor/str')
-json := load('../vendor/json')
+json := load('../lib/rejson')
 
 log := std.log
 f := std.format