const jsonfile = require('jsonfile'); const words = require('../lib/wordlist.json'); const wordsAdditional = require('../lib/wordlistAdditional.json'); const bigList = new Map([]); const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es', 'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs', 'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote', 'iot', 'mqtt']; const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify']; let unrated = []; var _global = typeof global === 'undefined' ? window : global; var Corpus = (_global.Corpus = _global.Corpus || {}); const emailRegex = /[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/; const detagRegex = /()|()|()|(<\/?(\s|\S)*?>)/gi; const desymbolNumberRegex = /[\n\t+$,\?\.\%\*=&:;()\\/\-£…"]|\d+/gi; const deSpace = /\s+/g; function cleanText(intext) { if (arguments.length === 0 || typeof intext === 'undefined' || intext === null ) return ''; return intext.replace(emailRegex, ' ').replace(detagRegex, ' ').replace(desymbolNumberRegex, ' ').replace(deSpace, ' ').trim().toLowerCase(); } function dedupe(intext) { if (arguments.length === 0 || intext === null ) return []; return [...new Set(intext)]; } function incItem(item) { if (bigList.has(item)) bigList.set(item, bigList.get(item) + 1); else bigList.set(item, 1); } /** * Process the body * @param intext * @returns {{score: number, bad: *, good: *}} */ Corpus.process = function(intext) { const workText = cleanText(intext); const workArray = workText.split(' '); const cleanedArray = dedupe(workArray).filter((v) => { return (words.indexOf(v) === -1 && wordsAdditional.indexOf(v) === -1); }); const good = cleanedArray.filter((v) => { return (goodWords.indexOf(v) !== -1); }); const bad = cleanedArray.filter((v) => { return (badWords.indexOf(v) !== -1); }); const unused = cleanedArray.filter((v) => { return ((badWords.indexOf(v) === -1) && (goodWords.indexOf(v) === -1)); }); cleanedArray.map((item)=> { incItem(item); }); unrated = [...unrated, ...unused]; const score = good.length - (bad.length * 5); // console.log('unused', unused); return { good, bad, score, 'words':cleanedArray }; }; Corpus.exportUnused = function() { jsonfile.writeFileSync('./unused.json', dedupe(unrated)); jsonfile.writeFileSync('./biglist.json', [...bigList].sort((a, b) => b[1] - a[1])); console.log([...bigList]); }; if (typeof module !== 'undefined') module.exports = { 'Corpus': Corpus };