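// Package grabber downloads RSS job feeds, converts their entries into
// RssItem values and filters them with keyword-based regular expressions.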
package grabber
import (
"fmt"
"github.com/shomali11/util/xhashes"
"io"
"log"
"net/http"
"reflect"
"regexp"
"github.com/mmcdole/gofeed"
)
// Package-level build metadata and the precompiled regular expressions used
// to enrich and filter feed items. All patterns are compiled once at package
// initialisation; regexp.MustCompile panics if a pattern is invalid.
//
// NOTE(review): none of the patterns carries the (?i) flag, so matching is
// case-sensitive. As written, every raw-string pattern also begins and ends
// with a literal space that must be present in the input -- confirm this is
// intentional and not a formatting artefact.
var (
// Version and Build are not assigned anywhere in this file.
// NOTE(review): presumably injected at build time (e.g. -ldflags "-X ...") -- TODO confirm.
Version string
Build string
// locationRegex, rateRegex and companyRegex each capture (group 1) the text
// of the table cell that follows the "Location:", "Rate:" and "Advertiser:"
// labels in an item's HTML summary. They are applied in additionalProcessing.
locationRegex = regexp . MustCompile ( ` Location:<\/strong><\/td><td width="50"> <\/td><td>(.*?)(?:<\/td>) ` )
rateRegex = regexp . MustCompile ( ` Rate:<\/strong><\/td><td width="50"> <\/td><td>(.*?)<\/td> ` )
companyRegex = regexp . MustCompile ( ` Advertiser:<\/strong><\/td><td width="50"> <\/td><td>(.*?)<\/td> ` )
// Accept filter: keywords whose presence in a title or summary marks an item
// as acceptable (see acceptItems). Each keyword must be followed by a
// non-word character (\W).
acceptRegex = regexp . MustCompile ( ` (full\s?stack|front\s?end|html|html5|es6|react|angular|knockout|ember|vue|riotjs|css|javascript|typescript|golang|go|sql|node|backbone|git|gulp|jquery|express|£\dk|Data Warehouse Developer|iot|internet of things)\W ` )
// Reject filters: a match of any of the three patterns below in a title or
// summary causes the item to be dropped (see rejectItems).
// pattRegex lists unwanted employers, technologies and contract terms.
pattRegex = regexp . MustCompile ( ` (Simply Education|Splunk|Coordinators?|Teachers?|Technical Writers?|Data Analyst|WebLogic|WebSphere|Data Scientist|Change Managers?|T24|Test Analyst|Insight Analyst|application tester|senior tester|Salesforce|QlikView|Navision|Murex|seo|django|drupal|SHAREPOINT|per annum|ServiceNow|Test Lead|User Researcher|Service Management|\(PERM\)|£\d.K|Remedy|ITSM|Symfony|Zend|Full Time|Technical Business Analyst|BUSINESS ANALYST|AUTOMATION TESTER|FIELD TECHNICIAN|websphere administrator|Research Data Scientist) ` )
// engineersRegex matches "<speciality> Engineer" roles for the listed specialities.
engineersRegex = regexp . MustCompile ( ` ((Support|Devops|Planning|security|Postgresql|network|sccm|test|data|imac|firewall|vmware)\s+Engineer) ` )
// developersRegex matches "<technology> Developer" roles for the listed technologies.
developersRegex = regexp . MustCompile ( ` ((Big Data|Java Server Side|Java|PHP|Graduate|Access|Oracle ADF|SHAREPOINT|Ruby on Rails|Java Software|IOS|Qlikview|c#|c\+\+|\.net|bi|go lang|Python)+\s+Developer+) ` )
// Disabled architect filter. The pattern uses a lookahead, (?=...), which
// Go's regexp package (RE2 syntax) does not support.
// NOTE(review): that is presumably why it is commented out -- confirm.
// architectsRegex = regexp.MustCompile(`(Java|PHP|Microsoft)+(?:\s)(?=Architect)`)
)
func Grab ( url string ) [ ] RssItem {
log . Printf ( "Grabbing: %v" , url )
// url := "https://www.jobserve.com/MySearch/F3A56475D5FD4966.rss"
content , err := readFeed ( url )
if err != nil {
log . Printf ( "failed to poll feed <%s>: %v" , url , err )
// continue
}
items , err := extractItems ( content )
if err != nil {
log . Printf ( "Failed to extract items from feed %s: %v" , url , err )
// log.Printf("Failed to extract items from feed %s: %v", feed.url, err)
// continue
}
log . Printf ( "Length %v\n" , len ( items ) )
items = rejectItems ( items )
log . Printf ( "Length %v\n" , len ( items ) )
2024-04-29 15:03:42 +00:00
acceptItems ( items )
2024-04-26 16:13:07 +00:00
log . Printf ( "Length %v\n" , len ( items ) )
return items
}
// readFeed performs an HTTP GET on url and returns the response body as a
// string.
//
// It returns an error when the request fails, when the server answers with a
// non-2xx status, or when the body cannot be read. Underlying errors are
// wrapped with %w so callers can inspect them with errors.Is / errors.As.
//
// The request is sent with http.Get, i.e. through http.DefaultClient, which
// does not apply a timeout.
func readFeed(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", fmt.Errorf("failed to request feed: %w", err)
	}
	defer resp.Body.Close()

	// An error page is not a feed; report it here instead of letting the
	// parser fail later on HTML it cannot understand.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return "", fmt.Errorf("failed to request feed: unexpected status %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("failed to read feed: %w", err)
	}
	return string(body), nil
}
// showStruct prints the name and value of every field of item to standard
// output, one "Field Name: ..., Field Value: ..." line per field.
//
// item may be a struct or a (possibly nested) pointer to a struct. Nil values,
// nil pointers and values that are not structs print nothing instead of
// panicking. Unexported fields are printed as well.
func showStruct(item any) {
	val := reflect.ValueOf(item)

	// Follow pointers so callers may pass either T or *T.
	for val.IsValid() && val.Kind() == reflect.Ptr {
		if val.IsNil() {
			return
		}
		val = val.Elem()
	}
	if !val.IsValid() || val.Kind() != reflect.Struct {
		return
	}

	typ := val.Type()
	for i := 0; i < val.NumField(); i++ {
		field := val.Field(i)

		// Interface() panics on unexported fields. fmt can still format the
		// reflect.Value itself, so fall back to that when needed.
		var value any = field
		if field.CanInterface() {
			value = field.Interface()
		}
		fmt.Printf("Field Name: %s, Field Value: %v\n", typ.Field(i).Name, value)
	}
}
// extractItems parses content as a feed with gofeed and converts every entry
// into an RssItem.
//
// For each entry the title, description (stored as Summary) and link are
// copied; Date is set only when the parser produced a parsed publication
// time; Id is xhashes.SHA1 of the entry's GUID and is left at its zero value
// when the GUID is empty. Each item is then passed through
// additionalProcessing before being appended to the result.
//
// It returns a nil slice and a wrapped error when content cannot be parsed.
func extractItems(content string) ([]RssItem, error) {
	feed, err := gofeed.NewParser().ParseString(content)
	if err != nil {
		return nil, fmt.Errorf("failed to parse feed: %w", err)
	}

	var items []RssItem
	for _, item := range feed.Items {
		var ri RssItem
		ri.Title = item.Title
		ri.Summary = item.Description
		ri.URL = item.Link
		if item.PublishedParsed != nil {
			ri.Date = *item.PublishedParsed
		}
		if item.GUID != "" {
			// Store a hash of the GUID rather than the raw value.
			ri.Id = xhashes.SHA1(item.GUID)
		}
		items = append(items, additionalProcessing(ri))
	}
	return items, nil
}
// additionalProcessing fills in the Location, Salary and Company fields of
// workItem from its Summary and returns the updated copy.
//
// Each field is set to the first capture group of locationRegex, rateRegex
// and companyRegex respectively; a field whose pattern does not match the
// summary is left untouched.
func additionalProcessing(workItem RssItem) RssItem {
	summary := workItem.Summary

	locMatch := locationRegex.FindStringSubmatch(summary)
	if locMatch != nil {
		workItem.Location = locMatch[1]
	}

	rateMatch := rateRegex.FindStringSubmatch(summary)
	if rateMatch != nil {
		workItem.Salary = rateMatch[1]
	}

	companyMatch := companyRegex.FindStringSubmatch(summary)
	if companyMatch != nil {
		workItem.Company = companyMatch[1]
	}

	return workItem
}
// acceptItems returns, in their original order, the entries of jobitems whose
// Title or Summary matches acceptRegex.
//
// For every entry it logs the title, whether it was accepted and the matched
// text (title match first, then summary match). The result is nil when no
// entry is accepted.
func acceptItems(jobitems []RssItem) []RssItem {
	var kept []RssItem
	for _, job := range jobitems {
		var hits []string
		for _, text := range []string{job.Title, job.Summary} {
			if m := acceptRegex.FindStringSubmatch(text); m != nil {
				hits = append(hits, m[0])
			}
		}

		matched := len(hits) > 0
		log.Printf("%v :: Accept? %v -- %v", job.Title, matched, hits)
		if matched {
			kept = append(kept, job)
		}
	}
	return kept
}
// rejectItems returns, in their original order, the entries of jobitems whose
// Title and Summary match none of pattRegex, engineersRegex and
// developersRegex.
//
// For every entry it logs the title, whether it was rejected and the matched
// text, ordered by pattern and, within a pattern, title before summary. The
// result is nil when every entry is rejected.
func rejectItems(jobitems []RssItem) []RssItem {
	filters := []*regexp.Regexp{pattRegex, engineersRegex, developersRegex}

	var kept []RssItem
	for _, job := range jobitems {
		var hits []string
		for _, re := range filters {
			for _, text := range []string{job.Title, job.Summary} {
				if m := re.FindStringSubmatch(text); m != nil {
					hits = append(hits, m[0])
				}
			}
		}

		dropped := len(hits) > 0
		log.Printf("%v :: Reject? %v -- %v", job.Title, dropped, hits)
		if !dropped {
			kept = append(kept, job)
		}
	}
	return kept
}