DSJobSkill.R
library(rvest)
library(stringr)
library(dplyr)
library(ggplot2)
clean.text <- function(text) {
  str_replace_all(text, regex('\r\n|\n|\t|\r|,|/|<|>|\\.'), ' ')
}
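# e.g. clean.text("R, Python/SQL\n") returns "R  Python SQL " (illustration only)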
# Given the running-total data frame and a vector of job links, scrape each posting and update the running skill counts
ScrapeJobLinks <- function(res, job.links){
  for(i in seq_along(job.links)){
    job.url <- paste0(BASE_URL, job.links[i])
    Sys.sleep(1)  # pause between requests to avoid hammering the server
    cat(paste0('Reading job ', i, '\n'))
    tryCatch({
      html <- read_html(job.url)
      text <- html_text(html)
      text <- clean.text(text)
      df <- data.frame(skill = KEYWORDS, count = ifelse(str_detect(text, KEYWORDS), 1, 0))
      res$running$count <- res$running$count + df$count
      res$num_jobs <- res$num_jobs + 1
    }, error = function(e){cat("ERROR :", conditionMessage(e), "\n")})
  }
  return(res)
}
KEYWORDS <- c('Hadoop','Python','\\bSQL\\b', 'NoSQL','\\bR\\b', 'Spark', 'SAS', 'Excel', 'AWS', 'Azure', 'Java', 'Tableau')
KEYWORDS_DISPLAY <- c('Hadoop','Python','SQL', 'NoSQL','R', 'Spark', 'SAS', 'Excel', 'AWS', 'Azure', 'Java', 'Tableau')
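# Note: '\\bSQL\\b' and '\\bR\\b' use word boundaries so that, for example,
# 'SQL' is not counted inside 'NoSQL' or 'MySQL' (illustration only):
#   str_detect('NoSQL and MySQL experience', '\\bSQL\\b')  # FALSE
#   str_detect('experience with R and Spark', '\\bR\\b')   # TRUE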
# Indeed Search Words
# job_title <- "\"Data+Analyst\""
job_title <- "\"Data+Scientist\""
# job_title <- "\"Trader\""
location <- 'Chicago%2C+IL'
# use advanced search to get 50 results per page
BASE_URL <- 'https://www.indeed.com'
ADV_URL <- paste0('https://www.indeed.com/jobs?as_and=&as_not=&as_cmp=&jt=all&st=&salary=&sr=directhire&radius=25&fromage=any&limit=50&sort=date&psf=advsrch&as_any=&as_phr=&as_ttl=', job_title, '&l=', location)
cat(ADV_URL)
# Scrape the search result
start_page <- read_html(ADV_URL)
job_count <- unlist(strsplit(start_page %>%
                               html_node("#searchCount") %>%
                               html_text(), split = ' '))
job_count <- as.numeric(str_replace_all(job_count[length(job_count) - 1], pattern = ',', replacement = ''))
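# Illustration (assuming the search-count text reads like 'Page 1 of 1,234 jobs'):
# the second-to-last token is '1,234', which becomes 1234 once the comma is dropped.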
cat('Total job count: ', job_count)
# Get start page job URLs
links <- start_page %>%
  html_nodes("h2 a") %>%
  html_attr('href')
# Get result page links
page.links <- start_page %>%
  html_nodes(xpath = '//div[contains(@class,"pagination")]//a') %>%
  html_attr('href')
# Create running total dataframe
running <- data.frame(skill = KEYWORDS_DISPLAY, count = rep(0, length(KEYWORDS_DISPLAY)))
# Indeed only exposes a limited number of result pages, so job_count overstates what we can scrape; num_jobs tracks the postings actually read
num_jobs <- 0
# Results object holding the running totals and the job counter
results <- list("running" = running, "num_jobs" = num_jobs)
if(job_count != 0){
  cat('Scraping jobs in Start Page\n')
  results <- ScrapeJobLinks(results, links)
}
# Walk the remaining result pages; skip the last pagination link, which is
# typically the 'Next' button and duplicates a numbered page
for(p in seq_len(max(0, length(page.links) - 1))){
  cat('Moving to Next 50 jobs\n')
  # Navigate to the next page
  new.page <- read_html(paste0(BASE_URL, page.links[p]))
  # Get new page job URLs
  links <- new.page %>%
    html_nodes("h2 a") %>%
    html_attr('href')
  # Scrape job links
  results <- ScrapeJobLinks(results, links)
}
# Print the running totals, most frequent skill first
print(arrange(results$running, -count))
# Convert the running counts to a share of the jobs actually scraped
results$running$count <- results$running$count / results$num_jobs
jt <- str_replace_all(job_title, '\\+|\\\"', ' ')
loc <- str_replace_all(location, '\\%2C+|\\+',' ')
p <- ggplot(results$running, aes(reorder(skill, -count), count)) +
  geom_bar(stat = "identity") +
  labs(x = 'Skill', y = 'Occurrences (%)',
       title = paste0('Skill occurrences (%) for ', jt, ' in ', loc))
p + scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.1))
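# Optional: save the chart with ggplot2's ggsave(); the file name and dimensions
# here are placeholder choices.
# ggsave('skill_occurrences.png', width = 8, height = 5)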