-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_dividends_future.py
153 lines (120 loc) · 4.59 KB
/
scrape_dividends_future.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
# from selenium.webdriver.common.keys import Keys
import time
import lxml
import requests
import re
# driver = webdriver.Firefox()
# Selenium browser session; created lazily inside get_all_dividends().
driver = None
# Select wrapper around the week dropdown; (re)bound after each page update.
week_selector = None
# marketbeat page listing upcoming ex-dividend dates, one week per dropdown option.
upcoming_dividends_dates_url = "https://www.marketbeat.com/dividends/ex-dividend-date-list/"
### Function that returns a DataFrame of dividend dates from a given url
def get_upcoming_dividend_dates(html=None) -> pd.DataFrame:
    """Return a DataFrame of upcoming dividend dates from marketbeat.

    Parameters
    ----------
    html : str or bytes, optional
        Pre-fetched page HTML (e.g. ``driver.page_source``). When ``None``,
        the page is fetched over HTTP from ``upcoming_dividends_dates_url``.

    Returns
    -------
    pd.DataFrame
        Columns: Ticker, Ex-Dividend Date, Yield, Period.
    """
    from io import StringIO
    # Fetch only when the caller supplied nothing; compare with `is None`
    # so an (invalid) empty string is not silently replaced by a re-fetch.
    if html is None:
        response = requests.get(upcoming_dividends_dates_url)
        response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
        html = response.content
    # Rewrite the table so ticker and company name are separate columns.
    html = split_ticker_and_company(html)
    # pandas >= 2.1 deprecates passing a literal HTML string; wrap in StringIO.
    upcoming_dividends_df = pd.read_html(StringIO(html), parse_dates=True)[0]
    upcoming_dividends_df = upcoming_dividends_df\
        .loc[:, ["Ticker", "Ex-Dividend Date", "Yield", "Period"]]
    # Drop malformed rows: non-data rows render as very long "date" strings.
    upcoming_dividends_df = upcoming_dividends_df[
        upcoming_dividends_df["Ex-Dividend Date"].str.len() < 100
    ]
    return upcoming_dividends_df
def get_all_dividends() -> pd.DataFrame:
    """Scrape every week of upcoming dividends from marketbeat.

    Iterates over the week dropdown on the ex-dividend-date page, scraping
    each week's table, until the dropdown index runs past the last option.

    Returns
    -------
    pd.DataFrame
        All scraped rows (Ticker, Ex-Dividend Date, Yield, Period) with NaN
        rows dropped and '--' periods normalized to 'monthly'. Empty if
        nothing could be scraped.
    """
    global driver, week_selector
    # Lazily start the browser so importing this module has no side effects.
    if driver is None:
        driver = webdriver.Firefox()
    # Navigate to the page and identify the week dropdown.
    driver.get(upcoming_dividends_dates_url)
    week_selector = Select(driver.find_element_by_id("cphPrimaryContent_ddlWeek"))
    # Close the big green square notification popup.
    driver.execute_script("closeIframeModal()")
    # clickButton("onesignal-slidedown-cancel-button")  # The notifications "cancel" button
    frames = []           # one DataFrame per dropdown option
    prev_entries = None   # last scraped table; used to detect the async page refresh
    try:
        dropdown_index = 0
        while True:  # exits via the exception raised past the last dropdown option
            print(f"Option {dropdown_index}")
            # Re-find the dropdown every iteration: selecting an option reloads the DOM.
            week_selector = Select(driver.find_element_by_id("cphPrimaryContent_ddlWeek"))
            week_selector.select_by_index(dropdown_index)
            new_entries = get_upcoming_dividend_dates(html=driver.page_source)
            # Poll until the table actually changes — the page updates asynchronously,
            # so right after selecting we may still be looking at the previous week.
            while prev_entries is not None and new_entries.loc[0, "Ticker"] == prev_entries.loc[0, "Ticker"]:
                time.sleep(0.5)
                new_entries = get_upcoming_dividend_dates(html=driver.page_source)
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
            frames.append(new_entries)
            prev_entries = new_entries
            print(f"{new_entries.shape[0]} new entries")
            dropdown_index += 1
    except Exception:
        # select_by_index raises once the index passes the last option;
        # this (crude) mechanism is the loop's exit condition.
        driver.quit()
    # Guard against a failure before the first successful scrape.
    total_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    total_df = total_df.dropna()
    if "Period" in total_df.columns:
        # marketbeat reports monthly payers as '--'.
        total_df['Period'] = total_df['Period'].replace('--', 'monthly')
    return total_df
def clickButton(b_id, tries=0):
    """Click the button whose CSS class is *b_id*, retrying until it appears.

    Parameters
    ----------
    b_id : str
        CSS class of the target ``<button>`` element.
    tries : int, optional
        Retry count already consumed; kept for backward compatibility with
        the original recursive signature.

    Raises
    ------
    Exception
        After ~100 failed attempts (about 50 seconds of polling).
    """
    global driver
    # Iterative retry loop instead of recursion: identical behavior, but no
    # risk of approaching the recursion limit on a very slow connection.
    while True:
        try:
            driver.find_element_by_css_selector(f"button.{b_id}").click()
            return
        except Exception:
            if tries > 100:
                raise Exception("Wifi is Terrible!")
            time.sleep(0.5)
            # print("clickDelay", b_id)
            tries += 1
def split_ticker_and_company(html) -> str:
    """Rewrite marketbeat's dividend table so ticker and company name
    become their own columns.

    Each row's first cell nests the ticker (``.ticker-area``) and company
    name (``.title-area``) in one ``<td>``; this appends two extra ``<td>``
    cells per row — and matching ``<th>`` headers — so ``pd.read_html`` can
    parse them as separate columns.

    Parameters
    ----------
    html : str or bytes
        Raw page HTML containing the dividends ``<table>``.

    Returns
    -------
    str
        The modified ``<table>`` element serialized back to HTML.
    """
    soup = BeautifulSoup(html, "lxml")
    table = soup.find('table')
    table_body = table.find('tbody')
    # Iterate over all data rows in the table.
    for row in table_body.find_all('tr'):
        # Extract the ticker and company name, if present in this row.
        try:
            ticker = row.select_one('div .ticker-area').string or ''
            company = row.select_one('div .title-area').string or ''
        except AttributeError:
            # select_one returned None: this row has no ticker/company markup.
            ticker = company = ''
        # Create new cells holding the extracted data and append them
        # to the end of the row.
        ticker_td = soup.new_tag('td')
        ticker_td.string = ticker
        company_td = soup.new_tag('td')
        company_td.string = company
        row.append(ticker_td)
        row.append(company_td)
    # Add the matching Ticker / Company Name columns to the thead.
    table_head_row = table.select_one('thead tr')
    ticker_th = soup.new_tag("th")
    ticker_th.string = "Ticker"
    table_head_row.append(ticker_th)
    company_th = soup.new_tag("th")
    company_th.string = "Company Name"
    table_head_row.append(company_th)
    return str(table)
# def get_lazy_ticker_from_company_name(agg_string):
# ticker, _ = re.findall(r"([A-Z]+)([A-Z].*)", agg_string)[0]
# if 1 <= len(ticker) <= 4:
# return ticker
# else:
# return None
# Script entry point: scrape all weeks of upcoming dividends and dump to CSV.
if __name__ == "__main__":
    upcoming_dividends_df = get_all_dividends()
    upcoming_dividends_df.to_csv("all_entries_filtered.csv", index=False)