// Package grabber downloads an RSS feed, converts its entries into RssItem
// values and filters them through keyword-based reject and accept regexes.
package grabber

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"reflect"
	"regexp"
	"time"

	"github.com/mmcdole/gofeed"
	"github.com/shomali11/util/xhashes"
)

var (
	// Version is never assigned in this file.
	// NOTE(review): presumably injected at build time (e.g. -ldflags) — confirm.
	Version string
	// Build is never assigned in this file.
	// NOTE(review): presumably injected at build time (e.g. -ldflags) — confirm.
	Build string

	// The following patterns pull single table cells out of an item's HTML
	// summary; capture group 1 is the cell content.
	locationRegex = regexp.MustCompile(`Location:<\/strong><\/td> <\/td>(.*?)(?:<\/td>)`)
	rateRegex     = regexp.MustCompile(`Rate:<\/strong><\/td> <\/td>(.*?)<\/td>`)
	companyRegex  = regexp.MustCompile(`Advertiser:<\/strong><\/td> <\/td>(.*?)<\/td>`)

	// accept filter
	//
	// NOTE(review): the pattern is case-sensitive and requires one non-word
	// character after the keyword, so a keyword at the very end of the text
	// does not match — confirm this is intended.
	acceptRegex = regexp.MustCompile(`(full\s?stack|front\s?end|html|html5|es6|react|angular|knockout|ember|vue|riotjs|css|javascript|typescript|golang|go|sql|node|backbone|git|gulp|jquery|express|£\dk|Data Warehouse Developer|iot|internet of things)\W`)

	// reject filters
	//
	// NOTE(review): these patterns are case-sensitive as written.
	pattRegex = regexp.MustCompile(`(Simply Education|Splunk|Coordinators?|Teachers?|Technical Writers?|Data Analyst|WebLogic|WebSphere|Data Scientist|Change Managers?|T24|Test Analyst|Insight Analyst|application tester|senior tester|Salesforce|QlikView|Navision|Murex|seo|django|drupal|SHAREPOINT|per annum|ServiceNow|Test Lead|User Researcher|Service Management|\(PERM\)|£\d.K|Remedy|ITSM|Symfony|Zend|Full Time|Technical Business Analyst|BUSINESS ANALYST|AUTOMATION TESTER|FIELD TECHNICIAN|websphere administrator|Research Data Scientist)`)
	// engineersRegex rejects "<speciality> Engineer" roles.
	engineersRegex = regexp.MustCompile(`((Support|Devops|Planning|security|Postgresql|network|sccm|test|data|imac|firewall|vmware)\s+Engineer)`)
	// developersRegex rejects "<technology> Developer" roles.
	developersRegex = regexp.MustCompile(`((Big Data|Java Server Side|Java|PHP|Graduate|Access|Oracle ADF|SHAREPOINT|Ruby on Rails|Java Software|IOS|Qlikview|c#|c\+\+|\.net|bi|go lang|Python)+\s+Developer+)`)
	// Disabled: Go's regexp package does not support lookahead ((?=...)).
	// architectsRegex = regexp.MustCompile(`(Java|PHP|Microsoft)+(?:\s)(?=Architect)`)
)

// Grab downloads the feed at url, converts its entries to RssItem values,
// drops every item matched by a reject filter and then keeps only the items
// matched by the accept filter. The item count is logged after each stage.
//
// It returns nil when the feed cannot be fetched or parsed; the failure is
// logged rather than returned.
func Grab(url string) []RssItem {
	log.Printf("Grabbing: %v", url)
	// Example feed: https://www.jobserve.com/MySearch/F3A56475D5FD4966.rss

	content, err := readFeed(url)
	if err != nil {
		log.Printf("failed to poll feed <%s>: %v", url, err)
		// Nothing to parse; running the rest of the pipeline would only
		// produce a second, misleading error.
		return nil
	}

	items, err := extractItems(content)
	if err != nil {
		log.Printf("Failed to extract items from feed %s: %v", url, err)
		return nil
	}
	log.Printf("Length %v\n", len(items))

	items = rejectItems(items)
	log.Printf("Length %v\n", len(items))

	items = acceptItems(items)
	log.Printf("Length %v\n", len(items))

	return items
}

// readFeed performs an HTTP GET on url and returns the response body as a
// string. It fails when the request errors or times out, when the server
// answers with a non-2xx status, or when the body cannot be read.
func readFeed(url string) (string, error) {
	// http.Get uses the default client, which never times out; bound the
	// whole exchange so a stalled server cannot hang the caller forever.
	const timeout = 30 * time.Second
	client := &http.Client{Timeout: timeout}

	resp, err := client.Get(url)
	if err != nil {
		return "", fmt.Errorf("failed to request feed: %w", err)
	}
	defer resp.Body.Close()

	// An error page is not a feed; report the status instead of handing the
	// page to the parser.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("failed to request feed: unexpected status %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("failed to read feed: %w", err)
	}
	return string(body), nil
}

// showStruct prints the name and value of every field of item to standard
// output. Pointers are followed to the struct they reference; a nil value or
// a value that is not a struct prints nothing.
func showStruct(item any) {
	val := reflect.ValueOf(item)
	for val.IsValid() && val.Kind() == reflect.Pointer {
		val = val.Elem() // yields the invalid Value for a nil pointer
	}
	if !val.IsValid() || val.Kind() != reflect.Struct {
		return
	}

	typ := val.Type()
	for i := 0; i < val.NumField(); i++ {
		field := val.Field(i)
		// Interface panics on unexported fields; fmt can still format the
		// reflect.Value itself.
		var shown any = field
		if field.CanInterface() {
			shown = field.Interface()
		}
		fmt.Printf("Field Name: %s, Field Value: %v\n", typ.Field(i).Name, shown)
	}
}

// extractItems parses content as a feed and converts every entry to an
// RssItem. Id is the SHA1 hash of the entry's GUID and is left empty when
// the entry has no GUID; Date is left at its zero value when the entry has
// no parsed publication date. Location, Salary and Company are filled from
// the summary by additionalProcessing.
func extractItems(content string) ([]RssItem, error) {
	feed, err := gofeed.NewParser().ParseString(content)
	if err != nil {
		return nil, fmt.Errorf("Failed to parse feed: %w", err)
	}

	var items []RssItem
	for _, item := range feed.Items {
		var ri RssItem
		ri.Title = item.Title
		if item.PublishedParsed != nil {
			ri.Date = *item.PublishedParsed
		}
		if item.GUID != "" {
			ri.Id = xhashes.SHA1(item.GUID)
		}
		ri.Summary = item.Description
		ri.URL = item.Link
		items = append(items, additionalProcessing(ri))
	}
	return items, nil
}

// additionalProcessing returns workItem with Location, Salary and Company set
// from the first match of the corresponding pattern in workItem.Summary.
// A field whose pattern does not match is left unchanged.
func additionalProcessing(workItem RssItem) RssItem {
	if loc := locationRegex.FindStringSubmatch(workItem.Summary); loc != nil {
		workItem.Location = loc[1]
	}
	if rate := rateRegex.FindStringSubmatch(workItem.Summary); rate != nil {
		workItem.Salary = rate[1]
	}
	if company := companyRegex.FindStringSubmatch(workItem.Summary); company != nil {
		workItem.Company = company[1]
	}
	return workItem
}

// matchedKeywords returns, for each pattern in order, the first match found
// in item.Title and then the first match found in item.Summary. The result
// is nil when nothing matches.
func matchedKeywords(item RssItem, patterns ...*regexp.Regexp) []string {
	var hits []string
	for _, re := range patterns {
		for _, text := range [...]string{item.Title, item.Summary} {
			if loc := re.FindStringIndex(text); loc != nil {
				hits = append(hits, text[loc[0]:loc[1]])
			}
		}
	}
	return hits
}

// acceptItems keeps the items whose Title or Summary matches acceptRegex and
// logs the decision, with the matched text, for every item.
func acceptItems(jobitems []RssItem) []RssItem {
	var items []RssItem
	for _, item := range jobitems {
		accept := matchedKeywords(item, acceptRegex)
		acceptable := len(accept) > 0
		log.Printf("%v :: Accept? %v -- %v", item.Title, acceptable, accept)
		if acceptable {
			items = append(items, item)
		}
	}
	return items
}

// rejectItems drops the items whose Title or Summary matches any of
// pattRegex, engineersRegex or developersRegex and logs the decision, with
// the matched text, for every item.
func rejectItems(jobitems []RssItem) []RssItem {
	var items []RssItem
	for _, item := range jobitems {
		rejected := matchedKeywords(item, pattRegex, engineersRegex, developersRegex)
		rejectable := len(rejected) > 0
		log.Printf("%v :: Reject? %v -- %v", item.Title, rejectable, rejected)
		if !rejectable {
			items = append(items, item)
		}
	}
	return items
}