2017-05-30 08:23:57 +00:00
|
|
|
/**
|
|
|
|
* Created by mdonnel on 18/05/2017.
|
|
|
|
*/
|
|
|
|
// Base name (without extension) of the JSON input file; go() reads ".\dist\<outputFile>.json".
let outputFile = 'jobs-special';
|
|
|
|
|
|
|
|
let log4js = require('log4js');
|
|
|
|
let logger = log4js.getLogger();
|
|
|
|
const stopwords = require('stopwords-en');
|
|
|
|
let jsonfile = require('jsonfile');
|
|
|
|
var striptags = require('striptags');
|
|
|
|
|
|
|
|
// Accumulates one array of non-stop-word tokens per processed item:
// processFile() pushes onto it and buildCorpus() iterates over it.
let strippedArray = [];
|
2017-05-31 14:47:53 +00:00
|
|
|
// Token statistics keyed by token; buildCorpus() creates each entry as
// {score: 0, common: 1} and increments `common` on every further occurrence.
// loadCorpus() reassigns this from the parsed contents of corpus.json.
let corpus = {};
|
2017-05-30 08:23:57 +00:00
|
|
|
|
|
|
|
/**
 * Splits a sentence into lower-cased word tokens and removes stop words.
 *
 * A token is a maximal run of `\w` characters (letters, digits, underscore).
 * Order and duplicates of the surviving tokens are preserved.
 *
 * @param {string} sentence - Text to tokenise. `null`/`undefined` are treated
 *     as empty text; other values are converted with `String()`.
 * @returns {string[]} Lower-cased tokens that are not in the list returned by
 *     `getStopWords()`. Empty when the text contains no word characters.
 */
function getNoneStopWords(sentence) {
    if (sentence === null || sentence === undefined) {
        return [];
    }

    // String.prototype.match returns null (not an empty array) when nothing
    // matches, e.g. for "" or punctuation-only text.
    const tokens = String(sentence).match(/\w+/g) || [];

    // A Set instead of a plain object used as a map: with an object, tokens
    // such as "constructor" or "__proto__" look "present" through
    // Object.prototype and would be dropped as if they were stop words.
    const stopWordSet = new Set();
    for (const stopWord of getStopWords()) {
        stopWordSet.add(stopWord.trim());
    }

    const keptWords = [];
    for (const token of tokens) {
        const word = token.toLowerCase();
        if (!stopWordSet.has(word)) {
            keptWords.push(word);
        }
    }
    return keptWords;
}
|
|
|
|
|
|
|
|
/**
 * Supplies the stop-word list used when filtering tokens.
 *
 * @returns {*} The module-level `stopwords` value (loaded from the
 *     'stopwords-en' package), returned by reference on every call.
 *     NOTE(review): callers index it and call `.trim()` on each element, so
 *     it is presumably an array of strings — confirm against the package.
 */
function getStopWords() {
    const stopWordList = stopwords;
    return stopWordList;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Strips markup and stop words from every item's description, appends the
 * resulting token arrays to the module-level `strippedArray`, and writes the
 * whole of `strippedArray` to the stripped.json file.
 *
 * @param {{items: Iterable<{description: string}>}} data - Parsed input file.
 *     When `data` or `data.items` is missing or not iterable (for example
 *     because the caller's read failed), nothing is processed or written.
 * @returns {void}
 */
function processFile(data) {
    // NOTE(review): Windows-style separators and a path relative to the
    // current working directory — confirm this is intended on other platforms.
    const fileName = ".\\dist\\stripped.json";
    console.log(fileName);

    const items = data ? data.items : undefined;
    if (items === null || items === undefined || typeof items[Symbol.iterator] !== 'function') {
        // Previously this surfaced as a TypeError on `data.items`.
        console.error('processFile: input has no iterable "items"; nothing written');
        return;
    }

    for (const item of items) {
        const plainText = striptags(item.description);
        strippedArray.push(getNoneStopWords(plainText));
    }

    // jsonfile's write callback receives only an error argument; report it
    // only when the write actually failed.
    jsonfile.writeFile(fileName, strippedArray, function (err) {
        if (err) {
            console.error(err);
        }
    });
}
|
|
|
|
|
|
|
|
|
2017-05-31 14:47:53 +00:00
|
|
|
/**
 * Folds every token in the module-level `strippedArray` into the module-level
 * `corpus` and writes `corpus` to the corpus.json file.
 *
 * A token seen for the first time gets the entry `{score: 0, common: 1}`;
 * each further occurrence increments that entry's `common` counter.
 *
 * @returns {void}
 */
function buildCorpus() {
    console.log('buildCorpus');
    console.log(strippedArray.length);

    // NOTE(review): Windows-style separators and a path relative to the
    // current working directory — confirm this is intended on other platforms.
    const fileName = ".\\dist\\corpus.json";

    // `corpus` can be undefined/null if a previous load produced no object;
    // start from an empty corpus instead of throwing.
    if (corpus === null || typeof corpus !== 'object') {
        corpus = {};
    }

    // Call hasOwnProperty through Object.prototype: once the token
    // "hasOwnProperty" itself is stored as a key, `corpus.hasOwnProperty`
    // is an entry object, not a function.
    const hasOwn = Object.prototype.hasOwnProperty;

    for (const tokens of strippedArray) {
        for (const token of tokens) {
            if (hasOwn.call(corpus, token)) {
                corpus[token].common++;
            } else {
                corpus[token] = {score: 0, common: 1};
            }
        }
    }

    // jsonfile's write callback receives only an error argument; report it
    // only when the write actually failed.
    jsonfile.writeFile(fileName, corpus, function (err) {
        if (err) {
            console.error(err);
        }
    });
}
|
|
|
|
|
|
|
|
/**
 * Reads the corpus.json file into the module-level `corpus` and then calls
 * `buildCorpus()`.
 *
 * - File read and parsed to an object: `corpus` is replaced with it.
 * - File does not exist (`ENOENT`, e.g. first run): the current `corpus` is
 *   kept and `buildCorpus()` still runs, so the file gets created.
 * - Any other read/parse error: the error is logged and `buildCorpus()` is
 *   NOT called, so a damaged corpus file is not overwritten.
 *
 * @returns {void}
 */
function loadCorpus() {
    // NOTE(review): Windows-style separators and a path relative to the
    // current working directory — confirm this is intended on other platforms.
    const fileName = ".\\dist\\corpus.json";
    console.log(fileName);

    jsonfile.readFile(fileName, function (err, obj) {
        if (err) {
            if (err.code !== 'ENOENT') {
                console.error(err);
                return;
            }
            // Missing file: fall through with the corpus we already have.
        } else if (obj !== null && typeof obj === 'object') {
            corpus = obj;
        }

        buildCorpus();
    });
}
|
|
|
|
|
|
|
|
|
2017-05-30 08:23:57 +00:00
|
|
|
/**
 * Runs the pipeline: reads the input JSON named by the module-level
 * `outputFile`, passes the parsed object to `processFile()`, then calls
 * `loadCorpus()`.
 *
 * If the input file cannot be read or parsed, the error is logged and neither
 * follow-up step runs.
 *
 * @returns {void}
 */
function go() {
    // NOTE(review): Windows-style separators and a path relative to the
    // current working directory (not to this script) — confirm this is
    // intended on other platforms.
    const fileName = ".\\dist\\" + outputFile + ".json";
    console.log(fileName);

    jsonfile.readFile(fileName, function (err, obj) {
        if (err) {
            // Without this guard processFile() would receive undefined.
            console.error(err);
            return;
        }

        processFile(obj);
        loadCorpus();
    });
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Entry point: start the pipeline as soon as this module is loaded.
go();
|
|
|
|
|