summary refs log tree commit diff
path: root/scripts/text-probability.js
blob: cc9340577164954800de13de003609e1252f7c8a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// @ts-check

const Ty = require("../src/types")
const fs = require("fs")
const domino = require("domino")
const repl = require("repl")

/**
 * Text content of every long (>= 100 characters) <pre> element found in the
 * HTML-formatted m.text messages of the JSON files named on the command line.
 * Each file is expected to be a JSON object with a `messages` array of events.
 */
const pres = (() => {
	const collected = []
	const inputFiles = process.argv.slice(2)
	for (const inputFile of inputFiles) {
		const parsed = JSON.parse(fs.readFileSync(inputFile, "utf8"))
		/** @type {Ty.Event.Outer<{msgtype?: string}>[]} */
		const messages = parsed.messages
		for (const message of messages) {
			// only plain text room messages are of interest
			const isTextMessage = message.type === "m.room.message" && message.content.msgtype === "m.text"
			if (!isTextMessage) continue
			/** @type {Ty.Event.M_Room_Message} */ // @ts-ignore
			const body = message.content
			// ...and only those carrying a non-empty HTML rendition
			if (body.format !== "org.matrix.custom.html" || !body.formatted_body) continue

			const dom = domino.createDocument(body.formatted_body)
			// @ts-ignore
			for (const preElement of dom.querySelectorAll("pre").cache) {
				const preText = preElement.textContent
				// short snippets are dropped
				if (preText.length >= 100) collected.push(preText)
			}
		}
	}
	return collected
})()

// Ask V8 to collect garbage now. `global.gc` is only defined when Node is
// started with --expose-gc, so skip the call otherwise instead of throwing a
// TypeError and aborting the script.
// @ts-ignore
global.gc?.()

/**
 * Heuristically decide whether preformatted text was meant to be displayed in a
 * fixed-width font (code, aligned columns, ASCII art) rather than being ordinary prose.
 *
 * The checks run in order and the first one that matches wins:
 * 1. a run of 3 or more spaces between two non-space characters,
 * 2. any character from Unicode General_Category "Symbol, other",
 * 3. three or more distinct leading-space indentation widths among non-blank lines,
 * 4. every non-blank line being indented,
 * 5. at least 4% of the text being code-like symbols, `.name` accesses or camelCase humps.
 *
 * @param {string} text
 * @returns {boolean} true when a fixed-width font was probably intended
 */
function probablyFixedWidthIntended(text) {
	// nothing to classify; also keeps the ratio at the end from dividing by zero
	if (text.length === 0) return false
	// if internal spaces are used, seems like they want a fixed-width font
	// (note: [^ ] also matches a newline, so 3+ spaces of indentation on any line after the first triggers this too)
	if (/[^ ] {3,}[^ ]/.test(text)) return true
	// if characters from Unicode General_Category "Symbol, other" are used, seems like they're doing ascii art and they want a fixed-width font
	// (the `u` flag is sufficient for \p{...} and works on older Node versions than `v`)
	if (/\p{So}/u.test(text)) return true
	// check start of line indentation (leading spaces only)
	const indents = new Set()
	for (const line of text.trimEnd().split("\n")) {
		// blank lines say nothing about indentation; counting them as indent 0 would defeat the "everything is indented" check below
		if (line.trim() === "") continue
		indents.add(line.match(/^ */)?.[0].length ?? 0)
		// if there are 3 or more different indents (counting 0) then it's code
		if (indents.size >= 3) return true
	}
	// if every non-blank line is indented then it's code
	if (indents.size > 0 && !indents.has(0)) return true
	// if there is a high proportion of symbols then it's code (this filter works remarkably well on its own)
	const symbolCount = [...text.matchAll(/[\\`~;+|<>%$@*&"'=(){}[\]_^]|\.[a-zA-Z]|[a-z][A-Z]/g)].length
	return symbolCount / text.length >= 0.04
}

// Start an interactive REPL and expose the collected <pre> texts and the classifier in its context.
const replServer = repl.start()
replServer.context.pres = pres
replServer.context.probablyFixedWidthIntended = probablyFixedWidthIntended

/*
Heuristics for classifying the text of a <pre> block:
- if it has a lot of symbols then it's code
- if it has >=3 levels of indentation then it's code
- if it is all indented then it's code
- if it has many spaces in a row in the middle then it's ASCII art
- if it has many non-Latin characters then it's natural-language text
  -> except if they are ASCII-art characters, e.g. ⣿⣿⡇⢸⣿⠃, then it's ASCII art
*/