summaryrefslogtreecommitdiff
path: root/src/detect.js
diff options
context:
space:
mode:
authorMinteck <contact@minteck.org>2021-12-21 16:52:28 +0100
committerMinteck <contact@minteck.org>2021-12-21 16:52:28 +0100
commit46e43f4bde4a35785b4997b81e86cd19f046b69b (patch)
treec53c2f826f777f9d6b2d249dab556feb72a6c3a6 /src/detect.js
downloadlangdetect-46e43f4bde4a35785b4997b81e86cd19f046b69b.tar.gz
langdetect-46e43f4bde4a35785b4997b81e86cd19f046b69b.tar.bz2
langdetect-46e43f4bde4a35785b4997b81e86cd19f046b69b.zip
Commit
Diffstat (limited to 'src/detect.js')
-rw-r--r--src/detect.js140
1 files changed, 140 insertions, 0 deletions
diff --git a/src/detect.js b/src/detect.js
new file mode 100644
index 0000000..1319fff
--- /dev/null
+++ b/src/detect.js
@@ -0,0 +1,140 @@
+splangs = require('./languages.json');
+
+function reverseObject(object) {
+ var newObject = {};
+ var keys = [];
+
+ for (var key in object) {
+ keys.push(key);
+ }
+
+ for (var i = keys.length - 1; i >= 0; i--) {
+ var value = object[keys[i]];
+ newObject[keys[i]]= value;
+ }
+
+ return newObject;
+}
+
+// Command-line entry point: takes the text to analyse from process.argv[2],
+// scores it against the supported languages using letter frequencies and
+// recognized words, then prints the three best candidates with timing stats.
+// NOTE(review): almost every variable below is assigned without a declaration,
+// so it becomes a global in sloppy mode. `langs` and `ent` are read here but
+// never assigned in this file — presumably they are globals set by `./status`
+// (or another required module); confirm before rescoping anything.
+try {
+ // `old` marks the start of the whole run, `lold` the start of module loading.
+ old = new Date();
+ lold = new Date();
+
+ const diacritics = require('./diacritics');
+ const count = require('./counter');
+ const percentages = require('./percentages');
+ const differenciate = require('./differenciate');
+ const bestLanguage = require('./better');
+ const prettylang = require('./prettylang');
+ const wordsDetect = require('./wdetect');
+
+ fs = require('fs');
+ chalk = require('chalk');
+
+ // Dedicated error type for problems raised by this script itself.
+ class LangdetectError extends Error {
+ constructor(...params) {
+ super(...params)
+ // We're spreading `params` as a way to bring all of `Error`'s functionality in.
+ }
+ }
+
+ // The text to analyse is mandatory; abort when no argument was given.
+ if (typeof process.argv[2] == "undefined") {
+ throw new LangdetectError("args: Missing operand");
+ }
+
+ console.log("Let's try to detect which language this is!");
+
+ // End of the "loading required files" phase for the timing report.
+ lrec = new Date();
+
+ // Required for its side effects only; the return value is discarded.
+ require("./status");
+
+ // --- Phase: decompose the text into letters and compute letter percentages ---
+ told = new Date();
+ console.log("Decomposing text... This will take a while for long texts!");
+ // Lowercase the input, then pass it through diacritics().
+ clean = diacritics(process.argv[2].toLowerCase());
+ // `cleaner` keeps ASCII letters only; `cleaner2` keeps ASCII letters and spaces.
+ cleaner = clean.replace(/[^a-zA-Z]/g, "");
+ cleaner2 = clean.replace(/[^a-zA-Z ]/g, "");
+ // Number of tokens after splitting on runs of spaces (commas were already removed above).
+ // NOTE(review): leading/trailing spaces or an empty string yield empty tokens,
+ // so this count can be higher than the real number of words.
+ wcwords = cleaner2.split(/[, ]+/).length;
+ letters = cleaner.split("");
+ console.log("Text contains " + letters.length + " letters");
+ console.log("Calculating percentage for all letters...");
+ qty = count(letters);
+ // Sanity check: the counter's total should equal the number of letters given to it.
+ if (qty.total != letters.length) {
+ console.log(chalk.red(chalk.bold("WARNING: ") + (letters.length - qty.total) + " characters were not included while counting all letters. This is probably a bug and should be reported."));
+ }
+ percs = percentages(qty);
+ trec = new Date();
+ tdiff = trec - told;
+ // --- Phase: compare the letter percentages with each language ---
+ mold = new Date();
+ console.log("Getting matches for each letter in each supported language...");
+ diffs = differenciate(percs, langs);
+ mrec = new Date();
+ mdiff = mrec - mold
+
+ // --- Phase: turn the per-letter differences into per-language scores ---
+ bold = new Date();
+ console.log("Finding the best matches between all supported languages...");
+ precounts = bestLanguage(diffs);
+ brec = new Date();
+ bdiff = brec - bold;
+
+ // --- Phase: factor recognized words into the scores ---
+ wold = new Date();
+ console.log("Including proportion of recognized words...");
+ newcounts = wordsDetect(precounts, cleaner2, langs);
+ // The result carries the per-language word counts under "_words"; pull them
+ // out so that only language scores remain in `newcounts`.
+ wcount = newcounts["_words"];
+ delete newcounts["_words"];
+ // Sort ascending by score, then reverse the key order: highest score first.
+ preprecounts = Object.entries(newcounts)
+ .sort(([,a],[,b]) => a-b)
+ .reduce((r, [k, v]) => ({ ...r, [k]: v }), {});
+ counts = reverseObject(preprecounts);
+ wrec = new Date();
+ wdiff = wrec - wold;
+
+ // Timing report: total duration, each phase, and the unaccounted remainder.
+ rec = new Date();
+ ddiff = rec - old;
+ ldiff = lrec - lold;
+ console.log(chalk.green("Done in " + Math.ceil(ddiff) + " ms:\n * " + Math.ceil(ldiff) + " ms loading required files\n * " + Math.ceil(tdiff) + " ms decomposing text\n * " + Math.ceil(mdiff) + " ms gathering matches\n * " + Math.ceil(bdiff) + " ms finding the best matches\n * " + Math.ceil(wdiff) + " ms including known words\n * " + Math.ceil(ddiff - (ldiff + tdiff + mdiff + bdiff + wdiff)) + " ms doing other things"));
+
+ // `ent` is not assigned in this file — presumably the database entry count; TODO confirm.
+ if (ent < 600) {
+ console.log(chalk.red(chalk.bold("WARNING: ") + "Not enough entries to have accurate results: " + ent + "/600 (" + Math.round((ent/600)*100) + "%)"));
+ }
+
+ // Language keys in ranked order, and the sum of all scores used for the percentages.
+ keys = Object.keys(counts);
+ total = 0;
+ keys.forEach((key) => {
+ total = total + counts[key];
+ })
+
+ // Apply the same sort-then-reverse to the word counts (highest first).
+ prewcount = Object.entries(wcount)
+ .sort(([,a],[,b]) => a-b)
+ .reduce((r, [k, v]) => ({ ...r, [k]: v }), {});
+ wcount = reverseObject(prewcount);
+
+ // Warn when the top-ranked language has no recognized word at all.
+ if (wcount[keys[0]] < 1) {
+ console.log(chalk.red(chalk.bold("WARNING: ") + "No words detected in text, results may not be very accurate"));
+ }
+
+ // Warn about inputs that are very short or very long.
+ if (letters.length < 16) {
+ console.log(chalk.red(chalk.bold("WARNING: ") + "Not enough letters to get accurate results, consider adding a whole sentence instead of isolated words."));
+ }
+ if (letters.length > 1000) {
+ console.log(chalk.red(chalk.bold("WARNING: ") + "Too much letters, consider detecting only a single sentence instead of a whole document."));
+ }
+
+ // Print the top three languages; ranks 2 and 3 are grayed out when their score is not above 0.
+ // NOTE(review): keys[1] / keys[2] are used without checking that at least three languages exist.
+ console.log("\nResults:");
+ console.log(chalk.bold(" 1. " + prettylang(keys[0]) + " (" + Math.round((counts[keys[0]] / total) * 100) + "%, " + wcount[keys[0]] + "/" + wcwords + " words)"));
+ if (counts[keys[1]] > 0) {
+ console.log(" 2. " + prettylang(keys[1]) + " (" + Math.round((counts[keys[1]] / total) * 100) + "%, " + wcount[keys[1]] + "/" + wcwords + " words)");
+ } else {
+ console.log(chalk.gray(" 2. " + prettylang(keys[1]) + " (" + Math.round((counts[keys[1]] / total) * 100) + "%, " + wcount[keys[1]] + "/" + wcwords + " words)"));
+ }
+ if (counts[keys[2]] > 0) {
+ console.log(" 3. " + prettylang(keys[2]) + " (" + Math.round((counts[keys[2]] / total) * 100) + "%, " + wcount[keys[2]] + "/" + wcwords + " words)");
+ } else {
+ console.log(chalk.gray(" 3. " + prettylang(keys[2]) + " (" + Math.round((counts[keys[2]] / total) * 100) + "%, " + wcount[keys[2]] + "/" + wcwords + " words)"));
+ }
+} catch (e) {
+ // Only the "truncated JSON" parse failure gets a friendly message; every
+ // other error (including LangdetectError) is rethrown unchanged.
+ // NOTE(review): this matches on the engine's exact message text, and the
+ // top-level require of './languages.json' sits outside this try block, so a
+ // parse failure there is not handled here.
+ if (e.message == "Unexpected end of JSON input") {
+ console.log(require('chalk').red("Unable to open database file. Is the databased opened by another program? Or is it corrupted?"));
+ } else {
+ throw e;
+ }
+} \ No newline at end of file