jobscraper/lib/corpus.js
2020-09-10 19:42:17 +01:00

92 lines
2.7 KiB
JavaScript

const jsonfile = require('jsonfile');
const words = require('../lib/wordlist.json');
const wordsAdditional = require('../lib/wordlistAdditional.json');
// Running tally of how many processed texts contained each retained word
// (word -> count). Updated by incItem, written out by Corpus.exportUnused.
const bigList = new Map([]);
// Words that raise a text's score: Corpus.process adds 1 for each distinct one found.
const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es',
'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs',
'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote',
'iot', 'mqtt'];
// Words that lower a text's score: Corpus.process subtracts 5 for each distinct one found.
const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];
// Accumulates words that were in neither goodWords nor badWords across all
// Corpus.process calls; de-duplicated and written out by Corpus.exportUnused.
let unrated = [];
// Pick the global object: `global` when it is defined, otherwise `window`.
var _global = typeof global === 'undefined' ? window : global;
// Reuse a Corpus namespace already present on the global object, or create
// a new one and register it there.
var Corpus = (_global.Corpus = _global.Corpus || {});
// Matches an e-mail address. The `g` flag makes sure every address in the text
// is stripped (not just the first), and `i` is needed because the text is only
// lower-cased at the very end of cleanText.
const emailRegex = /[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/gi;
// Matches whole <script> and <style> elements, HTML comments, and any other tag.
const detagRegex = /(<script(\s|\S)*?<\/script>)|(<style(\s|\S)*?<\/style>)|(<!--(\s|\S)*?-->)|(<\/?(\s|\S)*?>)/gi;
// Matches individual punctuation/symbol characters and runs of digits.
const desymbolNumberRegex = /[\n\t+$,\?\.\%\*=&:;()\\/\-£…"]|\d+/gi;
// Matches any run of whitespace.
const deSpace = /\s+/g;

/**
 * Reduce raw (possibly HTML) text to a lower-case string of space-separated tokens.
 *
 * Steps, in order: strip e-mail addresses, strip markup, strip the listed
 * symbols and all digit runs, collapse whitespace, trim, lower-case.
 *
 * @param {*} intext - Text to clean. `undefined`/`null` give an empty string;
 *   any other non-string value is converted with `String()` instead of throwing.
 * @returns {string} The cleaned text ('' when nothing is left).
 */
function cleanText(intext) {
  if (intext === undefined || intext === null) return '';
  const text = typeof intext === 'string' ? intext : String(intext);
  return text
    .replace(emailRegex, ' ')
    .replace(detagRegex, ' ')
    .replace(desymbolNumberRegex, ' ')
    .replace(deSpace, ' ')
    .trim()
    .toLowerCase();
}
/**
 * Return the distinct values of an iterable, in first-seen order.
 *
 * @param {Iterable|null|undefined} intext - Values to de-duplicate.
 * @returns {Array} A new array of unique values; [] when the argument is
 *   missing, `undefined` or `null`.
 */
function dedupe(intext) {
  if (intext == null) {
    return [];
  }
  const unique = new Set(intext);
  return Array.from(unique);
}
/**
 * Add one to the count stored for `item` in bigList, starting at 1 the
 * first time the item is seen.
 *
 * @param {*} item - Key to count.
 */
function incItem(item) {
  const nextCount = bigList.has(item) ? bigList.get(item) + 1 : 1;
  bigList.set(item, nextCount);
}
/**
 * Clean a body of text, reduce it to its distinct words, and score it
 * against the goodWords / badWords lists.
 *
 * Words found in either JSON word list (`words`, `wordsAdditional`) are
 * discarded before scoring. Side effects: every retained word has its count
 * in bigList incremented, and retained words that are neither good nor bad
 * are appended to the module-level `unrated` list.
 *
 * @param {*} intext - Raw text to process; passed straight to cleanText.
 * @returns {{good: string[], bad: string[], score: number, words: string[]}}
 *   `good` / `bad` are the retained words found in the respective list,
 *   `score` is `good.length - 5 * bad.length`, and `words` is every
 *   retained word.
 */
Corpus.process = function(intext) {
  const workText = cleanText(intext);
  // ''.split(' ') returns [''], which would record an empty-string "word"
  // for empty input, so treat that case as having no words at all.
  const workArray = workText === '' ? [] : workText.split(' ');
  const cleanedArray = dedupe(workArray).filter((v) => {
    return (words.indexOf(v) === -1 && wordsAdditional.indexOf(v) === -1);
  });
  const good = cleanedArray.filter((v) => goodWords.indexOf(v) !== -1);
  const bad = cleanedArray.filter((v) => badWords.indexOf(v) !== -1);
  const unused = cleanedArray.filter((v) => {
    return ((badWords.indexOf(v) === -1) && (goodWords.indexOf(v) === -1));
  });
  // Counting is a side effect only, so iterate with forEach rather than map.
  cleanedArray.forEach((item) => {
    incItem(item);
  });
  unrated = [...unrated, ...unused];
  // Each bad word outweighs five good ones.
  const score = good.length - (bad.length * 5);
  return { good, bad, score, 'words': cleanedArray };
};
/**
 * Dump the collected statistics to JSON files in the current working directory:
 * the de-duplicated unrated words to ./unused.json and the [word, count]
 * pairs, highest count first, to ./biglist.json. Also logs the unsorted
 * pairs to the console.
 */
Corpus.exportUnused = function() {
  const uniqueUnrated = dedupe(unrated);
  jsonfile.writeFileSync('./unused.json', uniqueUnrated);

  const byCountDescending = Array.from(bigList).sort((left, right) => right[1] - left[1]);
  jsonfile.writeFileSync('./biglist.json', byCountDescending);

  const insertionOrderPairs = Array.from(bigList);
  console.log(insertionOrderPairs);
};
// Expose Corpus to CommonJS consumers; skipped where `module` is not defined.
if (typeof module !== 'undefined') {
  module.exports = { Corpus };
}