Commit

author: Minteck <contact@minteck.org> 2021-12-21 16:52:28 +0100
committer: Minteck <contact@minteck.org> 2021-12-21 16:52:28 +0100
commit: 46e43f4bde4a35785b4997b81e86cd19f046b69b (patch)
tree: c53c2f826f777f9d6b2d249dab556feb72a6c3a6 /src/detect.js
download: langdetect-46e43f4bde4a35785b4997b81e86cd19f046b69b.tar.gz
langdetect-46e43f4bde4a35785b4997b81e86cd19f046b69b.tar.bz2
langdetect-46e43f4bde4a35785b4997b81e86cd19f046b69b.zip
1 files changed, 140 insertions, 0 deletions
diff --git a/src/detect.js b/src/detect.js
new file mode 100644
index 0000000..1319fff
--- /dev/null
+++ b/src/detect.js
@@ -0,0 +1,140 @@
+splangs = require('./languages.json');
+
+function reverseObject(object) {
+    var newObject = {};
+    var keys = [];
+
+    for (var key in object) {
+        keys.push(key);
+    }
+
+    for (var i = keys.length - 1; i >= 0; i--) {
+        var value = object[keys[i]];
+        newObject[keys[i]]= value;
+    }
+
+    return newObject;
+}
+
+try { // Script body: detect the language of the text passed as the first CLI argument and print the top three candidates.
+    old = new Date(); // start of the whole run; implicit global (no declaration keyword), like every bare assignment below
+    lold = new Date(); // start of the "loading required files" phase
+
+    const diacritics = require('./diacritics'); // applied to the lowercased input below; presumably strips accents - verify in ./diacritics
+    const count = require('./counter'); // its result exposes a `total` field that is compared with the letter count
+    const percentages = require('./percentages');
+    const differenciate = require('./differenciate');
+    const bestLanguage = require('./better');
+    const prettylang = require('./prettylang'); // turns a language key into the label printed in the results
+    const wordsDetect = require('./wdetect');
+
+    fs = require('fs'); // never read in this script; NOTE(review): possibly consumed as a global by sibling modules - confirm before removing
+    chalk = require('chalk'); // used for coloured warnings and results
+
+    class LangdetectError extends Error { // error type for invalid invocation; adds nothing beyond the subclass itself
+        constructor(...params) {
+            super(...params)
+            // We're spreading `params` as a way to bring all of `Error`'s functionality in.
+        }
+    }
+
+    if (typeof process.argv[2] == "undefined") { // no text argument was supplied
+        throw new LangdetectError("args: Missing operand"); // not matched by the catch below, so it is rethrown and aborts the script
+    }
+
+    console.log("Let's try to detect which language this is!");
+
+    lrec = new Date(); // end of the loading phase
+
+    require("./status"); // run for its side effects only; NOTE(review): `langs` and `ent` used below are never assigned in this file - presumably globals set by this module, confirm
+
+    told = new Date(); // start of the "decomposing text" phase
+    console.log("Decomposing text... This will take a while for long texts!");
+    clean = diacritics(process.argv[2].toLowerCase()); // lowercase first, then pass through ./diacritics
+    cleaner = clean.replace(/[^a-zA-Z]/g, ""); // letters only: everything outside a-z/A-Z is dropped
+    cleaner2 = clean.replace(/[^a-zA-Z ]/g, ""); // same filter, but spaces survive so words can be separated
+    wcwords = cleaner2.split(/[, ]+/).length; // word total shown in the results; "".split(...) yields [""], so this is never 0
+    letters = cleaner.split(""); // one array entry per letter
+    console.log("Text contains " + letters.length + " letters");
+    console.log("Calculating percentage for all letters...");
+    qty = count(letters);
+    if (qty.total != letters.length) { // sanity check: the counter should have accounted for every letter
+        console.log(chalk.red(chalk.bold("WARNING: ") + (letters.length - qty.total) + " characters were not included while counting all letters. This is probably a bug and should be reported."));
+    }
+    percs = percentages(qty);
+    trec = new Date();
+    tdiff = trec - told; // ms spent decomposing (subtracting two Dates yields milliseconds)
+    mold = new Date(); // start of the "gathering matches" phase
+    console.log("Getting matches for each letter in each supported language...");
+    diffs = differenciate(percs, langs); // `langs` is a global that this file never assigns
+    mrec = new Date();
+    mdiff = mrec - mold // ms spent gathering matches
+
+    bold = new Date(); // start of the "finding the best matches" phase
+    console.log("Finding the best matches between all supported languages...");
+    precounts = bestLanguage(diffs);
+    brec = new Date();
+    bdiff = brec - bold; // ms spent finding the best matches
+
+    wold = new Date(); // start of the "including known words" phase
+    console.log("Including proportion of recognized words...");
+    newcounts = wordsDetect(precounts, cleaner2, langs); // result carries per-language scores plus a "_words" entry
+    wcount = newcounts["_words"]; // later indexed by language key and printed as "<n>/<wcwords> words"
+    delete newcounts["_words"]; // leave only the language scores in newcounts
+    preprecounts = Object.entries(newcounts) // rebuild the scores as an object whose keys are inserted in ascending score order
+        .sort(([,a],[,b]) => a-b) // ascending by value
+        .reduce((r, [k, v]) => ({ ...r, [k]: v }), {}); // back to a plain object; the accumulator is copied at every step
+    counts = reverseObject(preprecounts); // flip the insertion order so the highest score comes first
+    wrec = new Date();
+    wdiff = wrec - wold; // ms spent including known words
+
+    rec = new Date();
+    ddiff = rec - old; // total run time in ms
+    ldiff = lrec - lold; // ms spent loading required files
+    console.log(chalk.green("Done in " + Math.ceil(ddiff) + " ms:\n * " + Math.ceil(ldiff) + " ms loading required files\n * " + Math.ceil(tdiff) + " ms decomposing text\n * " + Math.ceil(mdiff) + " ms gathering matches\n * " + Math.ceil(bdiff) + " ms finding the best matches\n * " + Math.ceil(wdiff) + " ms including known words\n * " + Math.ceil(ddiff - (ldiff + tdiff + mdiff + bdiff + wdiff)) + " ms doing other things")); // "other things" = total minus the five measured phases
+
+    if (ent < 600) { // `ent` is never assigned in this file - presumably the database entry count set by ./status (confirm)
+        console.log(chalk.red(chalk.bold("WARNING: ") + "Not enough entries to have accurate results: " + ent + "/600 (" + Math.round((ent/600)*100) + "%)"));
+    }
+
+    keys = Object.keys(counts); // language keys in the order produced above (best score first)
+    total = 0; // sum of all scores; denominator of the percentages printed below
+    keys.forEach((key) => {
+        total = total + counts[key];
+    })
+
+    prewcount = Object.entries(wcount) // same ascending sort as for the scores, applied to the word counts
+        .sort(([,a],[,b]) => a-b)
+        .reduce((r, [k, v]) => ({ ...r, [k]: v }), {});
+    wcount = reverseObject(prewcount); // highest word count first
+
+    if (wcount[keys[0]] < 1) { // only the top-ranked language's word count is checked
+        console.log(chalk.red(chalk.bold("WARNING: ") + "No words detected in text, results may not be very accurate"));
+    }
+
+    if (letters.length < 16) { // input too short
+        console.log(chalk.red(chalk.bold("WARNING: ") + "Not enough letters to get accurate results, consider adding a whole sentence instead of isolated words."));
+    }
+    if (letters.length > 1000) { // input too long
+        console.log(chalk.red(chalk.bold("WARNING: ") + "Too much letters, consider detecting only a single sentence instead of a whole document."));
+    }
+
+    console.log("\nResults:"); // NOTE(review): keys[1]/keys[2] are undefined when fewer than three languages are scored; those ranks then reach prettylang(undefined)
+    console.log(chalk.bold(" 1. " + prettylang(keys[0]) + " (" + Math.round((counts[keys[0]] / total) * 100) + "%, " + wcount[keys[0]] + "/" + wcwords + " words)")); // share = score / total, rounded to a whole percent
+    if (counts[keys[1]] > 0) { // a runner-up whose score is not above 0 is printed in gray instead
+        console.log(" 2. " + prettylang(keys[1]) + " (" + Math.round((counts[keys[1]] / total) * 100) + "%, " + wcount[keys[1]] + "/" + wcwords + " words)");
+    } else {
+        console.log(chalk.gray(" 2. " + prettylang(keys[1]) + " (" + Math.round((counts[keys[1]] / total) * 100) + "%, " + wcount[keys[1]] + "/" + wcwords + " words)"));
+    }
+    if (counts[keys[2]] > 0) { // same rule for third place
+        console.log(" 3. " + prettylang(keys[2]) + " (" + Math.round((counts[keys[2]] / total) * 100) + "%, " + wcount[keys[2]] + "/" + wcwords + " words)");
+    } else {
+        console.log(chalk.gray(" 3. " + prettylang(keys[2]) + " (" + Math.round((counts[keys[2]] / total) * 100) + "%, " + wcount[keys[2]] + "/" + wcwords + " words)"));
+    }
+} catch (e) { // only one failure is handled here; every other error propagates unchanged
+    if (e.message == "Unexpected end of JSON input") { // exact match on the message JSON.parse throws for empty or truncated input
+        console.log(require('chalk').red("Unable to open database file. Is the databased opened by another program? Or is it corrupted?")); // chalk is re-required so this branch does not rely on the global assigned inside the try
+    } else {
+        throw e;
+    }
+}
+\ No newline at end of file
author	Minteck <contact@minteck.org>	2021-12-21 16:52:28 +0100
committer	Minteck <contact@minteck.org>	2021-12-21 16:52:28 +0100
commit	46e43f4bde4a35785b4997b81e86cd19f046b69b (patch)
tree	c53c2f826f777f9d6b2d249dab556feb72a6c3a6 /src/detect.js
download	langdetect-46e43f4bde4a35785b4997b81e86cd19f046b69b.tar.gz langdetect-46e43f4bde4a35785b4997b81e86cd19f046b69b.tar.bz2 langdetect-46e43f4bde4a35785b4997b81e86cd19f046b69b.zip