NaziGramatical

Correcção ortográfica e gramatical automática em português europeu.

// ==UserScript==
// @name         NaziGramatical
// @namespace    http://nazigramatical.x10.bz/
// @version      0.6
// @description  Correcção ortográfica e gramatical automática em português europeu.
// @author       NaziGramatical
// @match        https://www.reddit.com/*
// @grant        GM_setValue
// @grant        GM_getValue
// @grant        GM_xmlhttpRequest
// @grant        GM_getResourceText
// @connect      nazigramatical.x10.bz
// @resource     pt.dic http://nazigramatical.x10.bz/pt.dic
// ==/UserScript==

(function() {
    'use strict';

    var parsedRules = [];
    var lastReplace = {count: 0};
    var dicWordsCache = {};
    var dic = false;
    var lastCheck = 0;
    var undos = 0;
    var disabled = false;

    var log = function () { /*console.log.apply(this, arguments);*/ };

    var loadRules = function () {
        var savedRules = GM_getValue('rules', []);
        for (var i = 0; i < savedRules.length; i++) {
            var regs = savedRules[i][0].match(/\/(.+)\/([^/]+)/);
            parsedRules.push([new RegExp(regs[1], regs[2]), savedRules[i][1]]);
        }
    };

    var dicWordExistsCS = function (word) {
        var time = Date.now();
        var result;
        if (dicWordsCache[word] !== undefined) result = dicWordsCache[word];
        else {
            result = (dic.indexOf("\n" + word + "\n") != -1);
        }
        log('Dic search: ' + word + ' Time: ' + (Date.now() - time) + ' ms. Result: ' + result);
        return result;
    };
    var dicWordExists = function (word) {
        if (dicWordExistsCS(word)) return true;
        var wordLC = word.toLowerCase();
        return word != wordLC && dicWordExistsCS(wordLC);
    };

    var dicCheckSentence = function (str) {
        var words = str.match(/[a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ]{3,}/g);
        if (words) {
            for (var i = 0; i < words.length; i++) {
                if (!dicWordExists(words[i])) return false;
            }
        }
        return true;
    };

    var saveRules = function () {
        var savedRules = [];
        for (var i = 0; i < parsedRules.length; i++) {
            if (typeof parsedRules[i][1] != 'string') continue;
            savedRules.push([parsedRules[i][0].toString(), parsedRules[i][1]]);
        }
        GM_setValue('rules', savedRules);
    };

    var replaceMultiple = function (str, a, b) {
        for (var i = 0; i < a.length; i++) str = str.replace(a[i], b[i]);
        return str;
    };

    var regexGroups = function (regex) {
        return regex.replace(/\\pL/g, '[a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ]')
            .replace(/\\pBL/g, '(?<=^|[^a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ])')
            .replace(/\\pBR/g, '(?=$|[^a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ])')
            .replace(/\\pV/g, '[aeiouáâãéêíóôõúÁÂÃÉÊÍÓÔÕÚ]')
            .replace(/\\pVE/g, '(?:e|é|ê)')
            .replace(/\\pC/g, '[b-df-hj-np-tv-zçÇ]')
            .replace(/\\pNUM/g, '(?:[0-9]+(?:[,.][0-9]+)?|uma?|dois|duas|tr[êe]s|quatro|cinco|seis|sete|oito|nove|dez|onze|doze|treze|catorze|quinze|dezasseis|dezassete|dezoito|dezanove|vinte|trinta|quarenta|cinquenta|sessenta|setenta|oitenta|noventa|cem|duzentos|trezentos|quatrocentos|quinhentos|seiscentos|setecentos|oitocentos|novezentos|mil)');
    };

    var parseRules = function (rules) {
        var parsedRules = [];
        for (var i = 0; i < rules.length; i++) {
            var rule = regexGroups(rules[i][0].toString());
            var replace = (typeof rules[i][1] == 'string') ?
                '$1' + rules[i][1].replace(/\$(\d+)/g, function (match, p1) { return '$' + (Number(p1) + 1); }) :
            (function (cb) {
                return (function () {
                    var args = [];
                    for (var i = 0; i < arguments.length; i++) args.push(arguments[i] ? arguments[i] : '');
                    var r = cb([args[1] ? args[0].substr(1) : args[0]].concat(args.slice(2, -2)));
                    return r ? args[1] + r : '';
                });
            })(rules[i][1]);
            var flags = '';
            var regs = rule.match(/^\/(.+)\/(.*?)$/);
            if (regs) {
                rule = regs[1];
                flags = regs[2];
            }
            var regs = rule.match(/^\(\?<([!=])(.+?)\)(.+)/);
            if (regs) {
                //log('Lookbehind rule:', rule);
                if (regs[1] == '=') rule = '(^|[^a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ]|'+regs[2]+')' + regs[3] + '(?=[^]$)';
                if (regs[1] == '!') {
                    if (/[^a-zA-Z0-9 ]/.test(regs[2])) {
                        log('Ignored imparsable rule with lookbehind:', rule);
                        continue;
                    }
                    rule = '(?!'+ regs[2] +')(.{'+regs[2].length+'}|^.{0,'+(regs[2].length-1)+'})' + regs[3] + '(?=[^]$)';
                }
                //log('Lookbehind rule parsed:', rule);
            }
            else if (/\(\?</.test(rule)) {
                //log('Ignored rule with lookbehind:', rule);
                continue;
            }
            else rule = '(^|[^a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ])' + rule + '(?=[^]$)';
            if (!flags) flags = 'i';
            parsedRules.push([new RegExp(rule, flags), replace]);
        }
        //log("ParsedRules: ", parsedRules);
        return parsedRules;
    };

    var isEnglish = function (text) {
        var regs = text.match(/(?:^| )(the|is|was|are|be|were|been|did|to|of|and|in|that|have|had|i|it|not|on|with|he|you|at|this|but|his|by|from|they|we|say|her|she|or|an|will|my|one|all|would|there|their|what|up|out|if|about|who|get|which|go|when|make|can|like|time|just|him|know|take|people|into|year|your|good|some|could|them|see|other|than|then|now|look|only|its|over|think|also|back|after|two|how|our|work|first|well|way|even|new|want|because|any|these|give|day|most|thanks?)(?=$|[^a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ])/ig);
        var count = regs ? regs.length : 0;
        if (count) {
            var words = text.match(/[a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ\'-]+/g);
            log("English words: "+count+" Total words: "+words.length+" Ratio: "+(count/words.length));
            return count / words.length > 0.1;
        }
        log("No English words found.");
        return false;
    };

    var isPortuguese = function (text) {
        var words = text.match(/[a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ]{2,}/g);
        var errors = 0;
        if (words) {
            words = words.slice(-10);
            for (var i = 0; i < words.length; i++) {
                if (!dicWordExists(words[i])) errors++;
            }
            log("Portuguese words: "+(words.length - errors)+" Total words: "+words.length+" Non Portuguese Ratio: "+(errors / words.length));
            return errors / words.length <= 0.25;
        }

        return false;
    };

    var checkLanguage = function (text) {
        text = text.replace(/https?:\/\/[^ )]+/g, '');
        text = text.replace(/[ru]\/[a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ0-9_-]+/g, '');
        text = text.replace(/«.+?»/g, '');
        text = text.replace(/".+?"/g, '');
        text = text.replace(/\*\*(.+?)\*\*/g, '$1'); // remove bold
        text = text.replace(/\*.+?\*/g, '');
        text = text.replace(/[a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ-]+[^]$/, ''); // remove last word (including compound words)
        // get last 10 words longer than 2 chars
        var match = text.match(/([a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ]+[^a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ]+([a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ]{1,1}[^a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ]+)?){0,10}$/);
        if (!match) log('Error: no match on checkLanguage:', text);
        else text = match[0];
        log('checkLanguage', text);
        return isPortuguese(text);
        //return !isEnglish(text) && isPortuguese(text);
    };

    var scanLine = function (line) {
        log(`scanline: "${line}"`);
        var result = '';
        for (var i = 0; i < parsedRules.length; i++) {
            result = checkWord(line, parsedRules[i][0], parsedRules[i][1]);
            if (result) break;
        }
        log(`result: "${result}"`);
        //hyphenated word
        var match = (result || line).match(/^(.+-)([a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ]+[^])$/);
        if (match) {
            var result2 = scanLine(match[1]);
            if (result2) result = result2 + match[2];
        }
        return result;
    };

    var fixCase = function (str1, str2) {
        //log(`fixCase: "${str1}" "${str2}"`);
        var isUpperCase = function (s) { return (s == s.toUpperCase()); };
        var isLowerCase = function (s) { return (s == s.toLowerCase()); };
        var toTitleCase = function (s) { return s.charAt(0).toUpperCase() + s.substr(1); };
        var isTitleCase = function (s) { return (s == toTitleCase(s)); };

        if (isLowerCase(str1)) return str2;
        if (isUpperCase(str1)) return str2.toUpperCase();
        if (isTitleCase(str1)) return toTitleCase(str2);
        return str2;
    };

    var checkWord = function (line, regex, replace) {
        //log("Line: '"+line+"' Regex: "+regex+" Replace: "+replace);
        if (regex.test(line)) {
            var match = line.match(regex);
            var index = match.index ? match.index + 1 : 0;
            var replaced = line.replace(regex, replace).slice(index, -1);
            //log(`replaced: '${replaced}'`);
            if (replaced) {
                log("Line: '"+line+"'");
                log("Regex: "+regex+" Replace: "+replace+" Match:", match);
                replaced = fixCase(line.slice(index, -1), replaced);
                //log(`replaced: '${replaced}'`);
                //log('check correction:', dicCheckSentence(replaced));
                return line.slice(0, index) + replaced + line.slice(-1);
            }
        }
    };

    var removeDiacritics = function (str) {
        (function(a, b) { for (var i = 0; i < a.length; i++) str = str.replace(new RegExp(a[i], 'g'), b[i]); })(
            ['á', 'â', 'é', 'ê', 'í', 'ó', 'ô', 'ú', 'Á', 'Â', 'É', 'Ê', 'Í', 'Ó', 'Ô', 'Ú'],
            ['a', 'a', 'e', 'e', 'i', 'o', 'o', 'u', 'A', 'A', 'E', 'E', 'I', 'O', 'O', 'U']);
        return str;
    };

    var tryCorrection = function (w1, w2) {
        log(`tryCorrection "${w1}" "${w2}"`);
        if (!dicWordExists(w1) && dicWordExists(w2)) return w2;
    };

    document.addEventListener('input', function (e) {
        if (disabled) return;
        if (!dic) {
            dic = {};
            setTimeout(function () {
                var time = Date.now();
                dic = "\n"+GM_getResourceText('pt.dic')+"\n";
                log('Loaded dic in ' + (Date.now() - time) + ' ms.');
            }, 100);
            return;
        }
        lastReplace.count++;
        var input = e.target;
        var text = input.value;
        if (!text) return; // no text
        var sep = text.substr(input.selectionEnd - 1, 1);
        if (/[a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ-]/.test(sep)) return;
        var left = text.substr(0, input.selectionEnd);
        var leftLines = left.match(/.+(\n+|$)/g);
        var line = leftLines.pop();
        var right = text.substring(input.selectionEnd, text.length);
        if (/^ *>/.test(line)) return; // quotes
        if (/https?:\/\/[^ )]+[^]$/.test(line)) return; // urls
        if (/[ru]\/[a-zA-ZáâãéêíóôõúçÁÂÃÉÊÍÓÔÕÚÇ0-9_-]+[^]$/.test(line)) return; // users/subs
        if (/«[^»]+[^]$/.test(line)) return; // quoted
        if (/^[^"]*"[^"]*([^"]*"[^"]*"[^"]*)*[^]$/.test(line)) return; // quoted
        if (/^[^*]*\*[^*]*([^*]*\*[^*]*\*[^*]*)*[^]$/.test(line.replace(/\*\*.+?\*\*/g, ''))) return; // italic
        if (!checkLanguage(left)) return;
        lastCheck = Date.now(); // prevent cleanup
        log(`Checking: "${line}"`);
        var result = scanLine(line);
        if (!result) return;
        lastReplace = {input: input, value: input.value, selectionEnd: input.selectionEnd, count: 0};
        log('Replacing "'+line+'" with "'+result+'"');
        line = result;
        leftLines.push(line);
        left = leftLines.join('');
        input.value = left + right;
        input.selectionEnd = left.length;
    });

    document.addEventListener('keydown', function(e) {
        if (e.ctrlKey && e.keyCode == 90) {
            if  (!(lastReplace.count--)) {
                log("NaziGramatical: Undo.");
                lastReplace.input.value = lastReplace.value;
                lastReplace.input.selectionEnd = lastReplace.selectionEnd;
                if (++undos == 2 && confirm('The spellchecker will be disabled in this page.')) disabled = true;
                e.preventDefault();
            }
            lastReplace.count--;
        }
    });

    // update rules
    if (Date.now() - GM_getValue('rulesTime', 0) > GM_getValue('updateInterval', 0) * 1000) {
        log("NaziGramatical: Updating rules.");
        GM_xmlhttpRequest({
            method: 'GET',
            url: 'http://nazigramatical.x10.bz/rules.php',
            onload: function (response) {
                var data = JSON.parse(response.responseText);
                parsedRules = parsedRules.concat(parseRules(data.rules));
                GM_setValue('updateInterval', Math.min(604800, data.updateInterval));
                GM_setValue('dic.index', data.dic);
                saveRules();
                log("NaziGramatical: Rules updated. Next update: "+data.updateInterval+" secs.");
            }
        });
        GM_setValue('rulesTime', Date.now());
    }
    // load stored rules
    else loadRules();

    // static rules
    (function () {
        var rules = [
            [/(\pL*[áâéêíóôú]\pL*)(mente|zinh[ao]s?)/, function (r) { return removeDiacritics(r[1]) + r[2]; }], //sózinho, sómente
            [/(in)?d([ei])s(\pL{3,})/, function (r) { return tryCorrection(r[0], r[1] + (r[2] == 'e' ? 'dis' : 'des') + r[3]); }], //destraído, distoar
            [/(\pL+i)ss(es?)/, function (r) { return tryCorrection(r[0], r[1]+'c'+r[2]); }], //chatisse
            [/(\pL*[aeiou])([íú])([zlr])/, function (r) { return tryCorrection(r[0], r[1]+removeDiacritics(r[2])+r[3]); }], //saír
            [/(\pL*)([áâéêíóôú])([zlr])/, function (r) { return tryCorrection(r[0], r[1]+removeDiacritics(r[2])+r[3]); }], //metêr, cristál
            [/(\pL*[^aeiou])([áéíóúâêô])(\pC+[aeo](?:s|m|ns)?)/, function (r) { return tryCorrection(r[0], r[1]+removeDiacritics(r[2])+r[3]); }], //fála, páras
            [/(\pL*[^aeiou])([íú])(s|m|ns)?/, function (r) { return tryCorrection(r[0], r[1]+removeDiacritics(r[2])+r[3]); }], //perú, patíns
            //[/([aeiou])([iu]?\pC+[aeiou][oea]s?)/, function (r) { return tryCorrection(r[0], r[1]+removeDiacritics(r[2])+r[3]); }],
            [/(\pL+)a([eo]s?)/, function (r) { return tryCorrection(r[0], r[1]+'ã'+r[2]); }], //mao, mae, maos, maes
            [/(\pL+)o(es?)/, function (r) { return tryCorrection(r[0], r[1]+'õ'+r[2]); }], //poe, poes
            [/(\pL+)c[aã]([eo]s?)/, function (r) { return tryCorrection(r[0], r[1]+'çã'+r[2]); }], //bêncao, bêncão, bêncaos, bêncãos, opcoes, opcões
            [/(\pL+)c[oõ](es?)/, function (r) { return tryCorrection(r[0], r[1]+'çõ'+r[2]); }], //licoes, licões,
            [/(\pL*\pV)[cp]([tcç]\pV\pL*)/, function (r) { return tryCorrection(r[0], (r[1] + r[2]).replace(/m(?=[cçt])/i, 'n')); }], // inflacção
            // enclise
            [/(((\pL+)([aeiouô]))([srz]))-([oa]s?)(?!-)/, function (r) {
                if (dicWordExists(r[1])) {
                    var replace = '';
                    if (r[1] == 'quer') replace = 'quere-' + r[6];
                    else if (r[5] == 's') {
                        replace = r[2] + '-l' + r[6];
                    }
                    else {
                        replace = r[3] + r[4].replace('a', 'á').replace('e', 'ê').replace('o', 'ô') + '-l' + r[6];
                    }
                    return replace;
                }
            }],
            // hyphen
            [/(\pL{2,}?)(s?)-(\pL+)/, function (r) {
                if (!dicWordExists(r[1] + r[2]) || !dicWordExists(r[3])) {
                    var left = r[1].slice(0, -1) + removeDiacritics(r[1].slice(-1)) + r[2];
                    var word = left + (/[aeiou]$/i.test(left) && /^[sr][aeiouáâãéêíóôõú]/i.test(r[3]) ? r[3].replace(/^([sr])/, '$1$1') : r[3]);
                    if (dicWordExists(word)) return word;
                }
                else if (
                    (/^(contra|extra|auto|neo|proto|pseudo)$/i.test(r[1]+r[2]) && !/^[aeiouáâãéêíóôõúhrs]/i.test(r[3])) ||
                    (/^(anti|arqui|semi)$/i.test(r[1]+r[2]) && !/^[iíhrs]/i.test(r[3])) ||
                    (/^(ante|entre|sobre)$/i.test(r[1]+r[2]) && !/^h/i.test(r[3])) ||
                    (/^(hiper|inter|super)$/i.test(r[1]+r[2]) && !/^[hr]/i.test(r[3])) ||
                    (/^(com|mal)$/i.test(r[1]+r[2]) && !/^[aeiouáâãéêíóôõúh]/i.test(r[3]))
                ) {
                    var word = r[1]+r[2]+r[3];
                    if (dicWordExists(word)) return word;
                }
            }],
            // brute force
            [/(\pL+)/, function (r) {
                if (!dicWordExists(r[0])) {
                    var time = Date.now();
                    var rule = replaceMultiple(r[0], [/a/gi, /e/gi, /i/gi, /o/gi, /u/gi, /c/gi], ['[aáâã]', '[eéê]', '[ií]', '[oóôõ]', '[uú]', '[cç]']);
                    var regex = new RegExp('\\n' + rule + '\\n', 'gi');
                    var match = dic.match(regex);
                    log('Possible matches:', match, 'Time took:', Date.now() - time);
                    if (match && match.length == 1) return match[0].replace(/\n/g, '');
                }
            }]
        ];
        parsedRules = parsedRules.concat(parseRules(rules));
    })();

    // clean dictionary when it's not needed anymore
    setInterval(function () {
        if (Date.now() - lastCheck > 25000) {
            dic = false;
        }
    }, 10000);
})();