/**
 * Created by mdonnel on 18/05/2017.
 *
 * Reads `.\dist\<outputFile>.json`, strips HTML tags and English stop words
 * from each item's description, writes the resulting token lists to
 * `.\dist\stripped.json`, then folds the tokens into the occurrence counts
 * held in `.\dist\corpus.json`.
 *
 * NOTE: the paths are relative to the current working directory and use
 * Windows path separators.
 */
const outputFile = 'jobs-special';
const log4js = require('log4js');
const logger = log4js.getLogger();
const stopwords = require('stopwords-en');
const jsonfile = require('jsonfile');
const striptags = require('striptags');

// One token list per processed item, in input order.
const strippedArray = [];
// token -> { score, common }; replaced by the contents of corpus.json when it loads.
let corpus = {};
// Lazily built lookup of stop words, shared by all getNoneStopWords() calls.
let stopWordSet = null;

/**
 * Returns the stop-word list provided by the `stopwords-en` package.
 *
 * @returns {string[]} the stop words
 */
function getStopWords() {
  return stopwords;
}

/**
 * Splits a sentence into `\w+` tokens, lower-cases them and drops every
 * token that appears in the stop-word list.
 *
 * @param {string} sentence - text to tokenise; null/undefined is treated as empty
 * @returns {string[]} the lower-cased non-stop-word tokens, in original order
 */
function getNoneStopWords(sentence) {
  if (stopWordSet === null) {
    // A Set (rather than a plain object) keeps inherited keys such as
    // "constructor" from being mistaken for stop words.
    stopWordSet = new Set(getStopWords().map((word) => word.trim()));
  }

  const text = sentence == null ? '' : String(sentence);
  // String.prototype.match returns null (not an empty array) when nothing matches.
  const words = text.match(/\w+/g) || [];

  return words
    .map((word) => word.toLowerCase())
    .filter((word) => !stopWordSet.has(word));
}

/**
 * Callback for jsonfile.writeFile: reports a failed write, stays quiet on success.
 *
 * @param {?Error} err - error passed by jsonfile, if any
 */
function logWriteError(err) {
  if (err) {
    console.error(err);
  }
}

/**
 * Strips tags and stop words from every item's description, appends the
 * token lists to `strippedArray` and writes that array to stripped.json.
 *
 * @param {{items: Array<{description: string}>}} data - parsed input file
 */
function processFile(data) {
  const fileName = ".\\dist\\stripped.json";
  console.log(fileName);

  // Tolerate a missing payload or a payload without an `items` list.
  const items = (data && data.items) || [];
  for (const item of items) {
    const description = striptags(item.description);
    strippedArray.push(getNoneStopWords(description));
  }

  jsonfile.writeFile(fileName, strippedArray, logWriteError);
}

/**
 * Adds every token in `strippedArray` to `corpus` (new tokens start at
 * `{score: 0, common: 1}`, known tokens get `common` incremented) and
 * writes the corpus to corpus.json.
 */
function buildCorpus() {
  console.log('buildCorpus');
  console.log(strippedArray.length);
  const fileName = ".\\dist\\corpus.json";

  if (corpus === null || typeof corpus !== 'object') {
    corpus = {};
  }

  for (const tokens of strippedArray) {
    for (const token of tokens) {
      // Call through Object.prototype: a token named "hasOwnProperty" stored
      // on the corpus would otherwise shadow the method and break the check.
      if (Object.prototype.hasOwnProperty.call(corpus, token)) {
        corpus[token].common++;
      } else {
        corpus[token] = { score: 0, common: 1 };
      }
    }
  }

  jsonfile.writeFile(fileName, corpus, logWriteError);
}

/**
 * Loads the existing corpus from corpus.json (when it can be read) and then
 * runs buildCorpus(). If the file cannot be read, the in-memory corpus is
 * kept as-is so the build still proceeds.
 */
function loadCorpus() {
  const fileName = ".\\dist\\corpus.json";
  console.log(fileName);

  jsonfile.readFile(fileName, function (err, obj) {
    if (err) {
      console.error(err);
    } else if (obj !== null && typeof obj === 'object') {
      corpus = obj;
    }
    buildCorpus();
  });
}

/**
 * Entry point: reads the input file, then processes it and updates the corpus.
 * Stops after logging the error when the input file cannot be read.
 */
function go() {
  const fileName = ".\\dist\\" + outputFile + ".json";
  console.log(fileName);

  jsonfile.readFile(fileName, function (err, obj) {
    if (err) {
      console.error(err);
      return;
    }
    processFile(obj);
    loadCorpus();
  });
}

go();