go-jobscraper/grabber/grabber.go
2024-04-30 08:59:23 +01:00

235 lines
5.7 KiB
Go

package grabber
import (
"fmt"
"github.com/shomali11/util/xhashes"
"io"
"log"
"net/http"
"reflect"
"regexp"
"github.com/mmcdole/gofeed"
)
// Package-level build metadata and the precompiled patterns used to enrich and
// filter job items. All patterns are compiled once at package initialisation;
// regexp.MustCompile panics at start-up if a pattern is invalid.
var (
// Version and Build are not assigned anywhere in this part of the file.
// NOTE(review): presumably injected at build time (e.g. -ldflags "-X ...") — confirm.
Version string
Build string
// The next three patterns are applied to an item's Summary by
// additionalProcessing. Each captures (group 1) the contents of the table
// cell that follows the "Location:", "Rate:" or "Advertiser:" label.
locationRegex = regexp.MustCompile(`Location:<\/strong><\/td><td width="50">&nbsp;<\/td><td>(.*?)(?:<\/td>)`)
rateRegex = regexp.MustCompile(`Rate:<\/strong><\/td><td width="50">&nbsp;<\/td><td>(.*?)<\/td>`)
companyRegex = regexp.MustCompile(`Advertiser:<\/strong><\/td><td width="50">&nbsp;<\/td><td>(.*?)<\/td>`)
// Accept filter: acceptItems keeps an item when this matches its Title or
// Summary.
// NOTE(review): the pattern has no (?i) flag, so matching is case-sensitive
// (e.g. "react" matches, "React" does not), and the trailing \W requires a
// non-word character after the keyword, so a keyword at the very end of the
// text does not match — confirm both are intended.
acceptRegex = regexp.MustCompile(`(full\s?stack|front\s?end|html|html5|es6|react|angular|knockout|ember|vue|riotjs|css|javascript|typescript|golang|go|sql|node|backbone|git|gulp|jquery|express|£\dk|Data Warehouse Developer|iot|internet of things)\W`)
// Reject filters: rejectItems drops an item when any of the following three
// patterns matches its Title or Summary. These are also case-sensitive as
// written.
// pattRegex lists standalone terms (employers, products, role names,
// contract wording).
pattRegex = regexp.MustCompile(`(Simply Education|Splunk|Coordinators?|Teachers?|Technical Writers?|Data Analyst|WebLogic|WebSphere|Data Scientist|Change Managers?|T24|Test Analyst|Insight Analyst|application tester|senior tester|Salesforce|QlikView|Navision|Murex|seo|django|drupal|SHAREPOINT|per annum|ServiceNow|Test Lead|User Researcher|Service Management|\(PERM\)|£\d.K|Remedy|ITSM|Symfony|Zend|Full Time|Technical Business Analyst|BUSINESS ANALYST|AUTOMATION TESTER|FIELD TECHNICIAN|websphere administrator|Research Data Scientist)`)
// engineersRegex matches "<qualifier> Engineer" for the listed qualifiers.
engineersRegex = regexp.MustCompile(`((Support|Devops|Planning|security|Postgresql|network|sccm|test|data|imac|firewall|vmware)\s+Engineer)`)
// developersRegex matches "<qualifier> Developer" for the listed qualifiers.
// The `+` after the group allows the qualifier to repeat back-to-back, and
// `Developer+` allows the final "r" to repeat.
developersRegex = regexp.MustCompile(`((Big Data|Java Server Side|Java|PHP|Graduate|Access|Oracle ADF|SHAREPOINT|Ruby on Rails|Java Software|IOS|Qlikview|c#|c\+\+|\.net|bi|go lang|Python)+\s+Developer+)`)
// Disabled: Go's regexp package (RE2 syntax) does not support lookahead
// such as (?=Architect), so MustCompile would panic on this pattern as
// written.
// architectsRegex = regexp.MustCompile(`(Java|PHP|Microsoft)+(?:\s)(?=Architect)`)
)
// Grab downloads the RSS feed at url, converts its entries to RssItems and
// returns the ones that survive the reject filters and then match the accept
// filter.
//
// Failures are logged rather than returned. When the feed cannot be fetched
// or parsed, Grab returns nil immediately instead of pushing empty data
// through the rest of the pipeline (which previously produced a second,
// misleading parse error and three "Length 0" log lines).
func Grab(url string) []RssItem {
	log.Printf("Grabbing: %v", url)

	content, err := readFeed(url)
	if err != nil {
		log.Printf("failed to poll feed <%s>: %v", url, err)
		return nil
	}

	items, err := extractItems(content)
	if err != nil {
		log.Printf("Failed to extract items from feed %s: %v", url, err)
		return nil
	}

	// Log the item count after each stage so the effect of every filter is
	// visible in the output.
	log.Printf("Length %v\n", len(items))
	items = rejectItems(items)
	log.Printf("Length %v\n", len(items))
	items = acceptItems(items)
	log.Printf("Length %v\n", len(items))
	return items
}
// readFeed performs an HTTP GET on url and returns the response body as a
// string.
//
// It returns an error when the request fails, when the server answers with a
// non-2xx status (so an HTML error page is never handed to the feed parser),
// or when the body cannot be read. Underlying errors are wrapped with %w so
// callers can inspect them with errors.Is / errors.As.
//
// Note: http.Get uses http.DefaultClient, which has no timeout configured.
func readFeed(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", fmt.Errorf("failed to request feed: %w", err)
	}
	defer resp.Body.Close()

	// Redirects are already followed by the client, so anything outside the
	// 2xx range at this point is a failure.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return "", fmt.Errorf("failed to request feed: unexpected status %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("failed to read feed: %w", err)
	}
	return string(body), nil
}
// showStruct is a debugging helper that prints the name and value of every
// field of a struct to standard output, one field per line.
//
// item may be a struct or a (possibly nested) pointer to one. For anything
// else — nil, a nil pointer, or a non-struct value — a single diagnostic line
// is printed instead of panicking inside the reflect package.
func showStruct(item any) {
	val := reflect.ValueOf(item)
	// Follow pointers down to the underlying value. Elem on a nil pointer
	// yields the zero Value, which ends the loop.
	for val.IsValid() && val.Kind() == reflect.Pointer {
		val = val.Elem()
	}
	if !val.IsValid() || val.Kind() != reflect.Struct {
		fmt.Printf("showStruct: %T is not a struct\n", item)
		return
	}

	typ := val.Type()
	for i := 0; i < val.NumField(); i++ {
		// Pass the reflect.Value itself: fmt prints the value it holds, and
		// unlike Value.Interface this does not panic on unexported fields.
		fmt.Printf("Field Name: %s, Field Value: %v\n", typ.Field(i).Name, val.Field(i))
	}
}
// extractItems parses content as a feed with gofeed and converts every entry
// into an RssItem.
//
// Title, URL (from the entry's Link) and Summary (from its Description) are
// copied directly. Date is set only when the parser produced a publication
// time. Id is the SHA-1 hash of the entry's GUID and is left at its zero
// value when the GUID is empty. Each item is then passed through
// additionalProcessing.
//
// On a parse failure it returns nil items and an error wrapping the parser's
// error with %w.
func extractItems(content string) ([]RssItem, error) {
	feed, err := gofeed.NewParser().ParseString(content)
	if err != nil {
		return nil, fmt.Errorf("Failed to parse feed: %w", err)
	}

	var items []RssItem
	for _, entry := range feed.Items {
		var ri RssItem
		ri.Title = entry.Title
		ri.URL = entry.Link
		ri.Summary = entry.Description
		if entry.PublishedParsed != nil {
			ri.Date = *entry.PublishedParsed
		}
		// Hash only real GUIDs so that entries without one keep an empty Id
		// rather than all sharing the hash of the empty string.
		if entry.GUID != "" {
			ri.Id = xhashes.SHA1(entry.GUID)
		}
		items = append(items, additionalProcessing(ri))
	}
	return items, nil
}
// additionalProcessing fills in Location, Salary and Company on workItem from
// its Summary, using locationRegex, rateRegex and companyRegex respectively.
// A field is only overwritten when its pattern matches; the updated copy of
// the item is returned.
func additionalProcessing(workItem RssItem) RssItem {
	summary := workItem.Summary

	// firstGroup returns capture group 1 of re within the summary, and
	// whether re matched at all.
	firstGroup := func(re *regexp.Regexp) (string, bool) {
		m := re.FindStringSubmatch(summary)
		if m == nil {
			return "", false
		}
		return m[1], true
	}

	if location, ok := firstGroup(locationRegex); ok {
		workItem.Location = location
	}
	if salary, ok := firstGroup(rateRegex); ok {
		workItem.Salary = salary
	}
	if advertiser, ok := firstGroup(companyRegex); ok {
		workItem.Company = advertiser
	}
	return workItem
}
// acceptItems returns the items whose Title or Summary matches acceptRegex,
// preserving input order. For every item it logs the title, the decision and
// the matched text (title match first, then summary match).
func acceptItems(jobitems []RssItem) []RssItem {
	var kept []RssItem
	for _, job := range jobitems {
		var hits []string
		for _, text := range [...]string{job.Title, job.Summary} {
			if m := acceptRegex.FindStringSubmatch(text); m != nil {
				hits = append(hits, m[0])
			}
		}

		matched := len(hits) > 0
		log.Printf("%v :: Accept? %v -- %v", job.Title, matched, hits)
		if !matched {
			continue
		}
		kept = append(kept, job)
	}
	return kept
}
// rejectItems returns the items that match none of the reject patterns
// (pattRegex, engineersRegex, developersRegex) in either Title or Summary,
// preserving input order. For every item it logs the title, the decision and
// all matched texts, ordered by pattern and, within a pattern, title before
// summary.
func rejectItems(jobitems []RssItem) []RssItem {
	filters := []*regexp.Regexp{pattRegex, engineersRegex, developersRegex}

	var survivors []RssItem
	for _, job := range jobitems {
		var reasons []string
		for _, re := range filters {
			for _, text := range [...]string{job.Title, job.Summary} {
				if m := re.FindStringSubmatch(text); m != nil {
					reasons = append(reasons, m[0])
				}
			}
		}

		hit := len(reasons) > 0
		log.Printf("%v :: Reject? %v -- %v", job.Title, hit, reasons)
		if hit {
			continue
		}
		survivors = append(survivors, job)
	}
	return survivors
}