92 lines
2.7 KiB
JavaScript
92 lines
2.7 KiB
JavaScript
const jsonfile = require('jsonfile');
|
|
|
|
const words = require('../lib/wordlist.json');
|
|
const wordsAdditional = require('../lib/wordlistAdditional.json');
|
|
|
|
const bigList = new Map([]);
|
|
|
|
const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es',
|
|
'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs',
|
|
'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote',
|
|
'iot', 'mqtt'];
|
|
const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];
|
|
let unrated = [];
|
|
|
|
var _global = typeof global === 'undefined' ? window : global;
|
|
var Corpus = (_global.Corpus = _global.Corpus || {});
|
|
|
|
const emailRegex = /[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/;
|
|
const detagRegex = /(<script(\s|\S)*?<\/script>)|(<style(\s|\S)*?<\/style>)|(<!--(\s|\S)*?-->)|(<\/?(\s|\S)*?>)/gi;
|
|
const desymbolNumberRegex = /[\n\t+$,\?\.\%\*=&:;()\\/\-£…"]|\d+/gi;
|
|
const deSpace = /\s+/g;
|
|
|
|
/**
 * Reduce raw text to a lower-case, single-spaced string of words.
 *
 * Steps, in order: replace matches of `emailRegex`, `detagRegex` and
 * `desymbolNumberRegex` with a space, collapse whitespace runs (`deSpace`)
 * to one space, trim, then lower-case.
 *
 * @param {*} intext - Text to clean. Non-string values are converted with
 *   `String()` instead of failing on a missing `.replace` method.
 * @returns {string} The cleaned text; `''` when `intext` is missing,
 *   `undefined` or `null`.
 */
function cleanText(intext) {
  if (intext === undefined || intext === null) {
    return '';
  }

  const text = typeof intext === 'string' ? intext : String(intext);

  return text
    .replace(emailRegex, ' ')
    .replace(detagRegex, ' ')
    .replace(desymbolNumberRegex, ' ')
    .replace(deSpace, ' ')
    .trim()
    .toLowerCase();
}
|
|
|
|
/**
 * Remove duplicate entries from an iterable, keeping first-seen order.
 *
 * @param {Iterable<*>} [intext] - Values to de-duplicate (an array of words
 *   in this file).
 * @returns {Array<*>} A new array of the distinct values; `[]` when `intext`
 *   is missing, `undefined` or `null`.
 */
function dedupe(intext) {
  if (intext === undefined || intext === null) {
    return [];
  }

  return Array.from(new Set(intext));
}
|
|
|
|
/**
 * Add one to the tally kept for `item` in the module-level `bigList` map,
 * starting the tally at 1 the first time the item is seen.
 *
 * @param {*} item - Map key to count (a word, as called from this file).
 * @returns {void}
 */
function incItem(item) {
  // A missing key yields `undefined`, which falls back to 0.
  const previous = bigList.get(item) || 0;
  bigList.set(item, previous + 1);
}
|
|
|
|
/**
 * Score a body of text against the good/bad word lists.
 *
 * The text is cleaned with `cleanText`, split on spaces, de-duplicated, and
 * stripped of every word found in `words` or `wordsAdditional`. Each
 * remaining word is counted once in `bigList`; words in neither `goodWords`
 * nor `badWords` are also appended to the module-level `unrated` list.
 *
 * @param {*} intext - Text to process (passed straight to `cleanText`).
 * @returns {{good: string[], bad: string[], score: number, words: string[]}}
 *   `good` / `bad`: remaining words found in the respective list;
 *   `score`: `good.length - 5 * bad.length`;
 *   `words`: all remaining words, in first-seen order.
 */
Corpus.process = function(intext) {
  // Splitting an empty cleaned string yields [''] - drop empty tokens so a
  // blank "word" is never counted or recorded as unrated.
  const tokens = cleanText(intext).split(' ').filter((token) => token !== '');

  const cleanedArray = dedupe(tokens).filter((word) => {
    return words.indexOf(word) === -1 && wordsAdditional.indexOf(word) === -1;
  });

  const good = [];
  const bad = [];

  for (const word of cleanedArray) {
    const isGood = goodWords.indexOf(word) !== -1;
    const isBad = badWords.indexOf(word) !== -1;

    if (isGood) {
      good.push(word);
    }
    if (isBad) {
      bad.push(word);
    }
    if (!isGood && !isBad) {
      // Appended in place; duplicates are removed at export time.
      unrated.push(word);
    }

    incItem(word);
  }

  // One bad word outweighs five good ones.
  const score = good.length - (bad.length * 5);

  return { good, bad, score, 'words': cleanedArray };
};
|
|
|
|
/**
 * Dump the accumulated word statistics to disk and to the console.
 *
 * Writes the de-duplicated `unrated` words to `./unused.json`, writes the
 * `bigList` entries as `[word, count]` pairs sorted by descending count to
 * `./biglist.json`, then logs the `bigList` entries in map order.
 *
 * @returns {void}
 */
Corpus.exportUnused = function() {
  const uniqueUnrated = dedupe(unrated);
  jsonfile.writeFileSync('./unused.json', uniqueUnrated);

  const byCountDescending = Array.from(bigList).sort(([, countA], [, countB]) => countB - countA);
  jsonfile.writeFileSync('./biglist.json', byCountDescending);

  console.log(Array.from(bigList));
};
|
|
|
|
// Publish the namespace as a CommonJS export when a `module` object exists.
if (typeof module !== 'undefined') {
  module.exports = { Corpus };
}
|