forked from royopa/anbima_scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathima_carteiras.py
143 lines (113 loc) · 4.2 KB
/
ima_carteiras.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import csv
import os
import time
from datetime import datetime, timedelta
import pandas as pd
import requests
from tqdm import tqdm
import utils
def get_ultima_data_disponivel_base(path_file_base):
    """Return the reference date found in the last row of the base CSV.

    Only the last non-empty row of the file is inspected. The first field of
    that row (the text before the first ``;``) is parsed as ``YYYY-MM-DD``.

    Args:
        path_file_base: Path of the ``;``-separated base file.

    Returns:
        A ``datetime.date`` with the last reference date, or ``None`` when
        the file is empty or contains only the ``dt_referencia`` header.

    Raises:
        ValueError: If the first field of the last row is not a valid
            ``YYYY-MM-DD`` date.
    """
    last_row = None
    with open(path_file_base, 'r') as f:
        # Stream the file and keep only the last non-empty row; csv.reader
        # yields [] for blank lines, which would otherwise break row[0].
        for row in csv.reader(f):
            if row:
                last_row = row

    if last_row is None:
        return None

    data = last_row[0].split(';')[0]
    if data == 'dt_referencia':
        # Only the header is present: the base has no data yet.
        return None
    return datetime.strptime(data, '%Y-%m-%d').date()
def remove_old_files():
    """Delete ``.xls`` files in ``downloads`` that are not dated today.

    A file is kept when the ten characters immediately before its ``.xls``
    extension equal today's date formatted as ``DD.MM.YYYY``. Entries that do
    not end in ``.xls`` are left alone. Nothing happens when the
    ``downloads`` directory does not exist.
    """
    if not os.path.isdir('downloads'):
        # Nothing has been downloaded yet, so there is nothing to clean up.
        return

    # `datetime` is the class imported from the datetime module, so now() is
    # called on it directly. Computed once: it does not change per file.
    today = datetime.now().strftime('%d.%m.%Y')

    for file_name in os.listdir('downloads'):
        if not file_name.endswith('.xls'):
            continue
        data_arquivo = file_name.split('.xls')[-2][-10:]
        if today != data_arquivo:
            os.remove(os.path.join('downloads', file_name))
def download_file_carteira(url, dt_referencia, file_name, indice):
    """Download one index portfolio file and save it to ``file_name``.

    Args:
        url: Endpoint that serves the portfolio download.
        dt_referencia: Reference date; sent as ``DD/MM/YYYY`` in the
            ``Dt_Ref``, ``DataIni`` and ``DataFim`` query parameters.
        file_name: Destination path; the response body is written there as
            raw bytes.
        indice: Name of the index to request (e.g. ``'ima-b'``).
    """
    dt_referencia = dt_referencia.strftime('%d/%m/%Y')
    params = {
        # The requested index is passed through; previously the literal
        # string 'indice' was sent and the argument was ignored.
        # NOTE(review): presumably the endpoint reads these two fields as
        # the index name -- confirm against the ANBIMA form.
        'Titulo_1': indice,
        'Indice': indice,
        'Consulta_1': 'Ambos',
        'Dt_Ref': dt_referencia,
        'DataIni': dt_referencia,
        'DataFim': dt_referencia,
        'Consulta': 'Ambos',
        'saida': 'csv',
        'Idioma': 'PT'
    }
    response = requests.get(url, params=params, stream=True)
    with open(file_name, "wb") as handle:
        # iter_content() defaults to 1-byte chunks; use a larger chunk so
        # the file is not written one byte at a time. The progress bar now
        # counts chunks rather than bytes.
        for data in tqdm(response.iter_content(chunk_size=8192)):
            handle.write(data)
    # The `with` block closes the file; no explicit close() is needed.
def generate_csv_base(df, path_file_base):
    """Rewrite the base CSV in place, sorted by ``dt_referencia``.

    The file at ``path_file_base`` is read with ``;`` as separator, ordered
    by the ``dt_referencia`` column and written back to the same path with
    that column as the index (first column).

    Args:
        df: Not used; the data is always re-read from ``path_file_base``.
        path_file_base: Path of the ``;``-separated base file.
    """
    ordered = pd.read_csv(path_file_base, sep=';').sort_values('dt_referencia')
    # Make dt_referencia the index so it is emitted as the leading column.
    ordered = ordered.set_index('dt_referencia')
    ordered.to_csv(path_file_base, sep=';')
def generate_xlsx_base(df, path_saida):
    """Write ``df`` to an Excel workbook at ``path_saida``.

    The data goes to a single sheet named ``Sheet1`` using the XlsxWriter
    engine.

    Args:
        df: DataFrame to export.
        path_saida: Destination path of the ``.xlsx`` file.
    """
    # Using the writer as a context manager saves and closes the workbook on
    # exit; ExcelWriter.save() is no longer available in pandas 2.x.
    with pd.ExcelWriter(path_saida, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
def xrange(x):
    """Return an iterator over the integers ``0 .. x - 1``."""
    numbers = range(x)
    return iter(numbers)
def datetime_range(start=None, end=None):
    """Yield each day from ``start`` to ``end``, both ends included.

    Nothing is yielded when ``end`` is earlier than ``start``. Both
    arguments must support subtraction yielding an object with a ``days``
    attribute; the ``None`` defaults are not usable as-is.
    """
    total_days = (end - start).days
    offset = 0
    while offset <= total_days:
        yield start + timedelta(days=offset)
        offset += 1
def main():
    """Download the IMA portfolio files for the most recent dates.

    Steps:
      1. Remove stale files from the downloads directory.
      2. Read and print the last reference date stored in the base CSV.
      3. For each date in the window ending today, and for each index,
         download the CSV into ``downloads/<index>/`` unless it already
         exists on disk.
    """
    # Remove old files.
    remove_old_files()

    # Check the last date available in the base file.
    name_file_base = 'ima_quadro_resumo_base.csv'
    path_file_base = os.path.join('bases', name_file_base)

    ultima_data_base = get_ultima_data_disponivel_base(path_file_base)
    print('Última data base disponível:', ultima_data_base)

    if ultima_data_base is None:
        # `datetime` is the class imported from the datetime module, so the
        # date is built through it (datetime.date(2010, 11, 17) would fail).
        ultima_data_base = datetime(2010, 11, 17).date()

    carteiras = [
        'irf-m',
        'irf-m 1',
        'irf-m 1+',
        'ima-b',
        'ima-b 5',
        'ima-b 5+',
        'ima-c',
        'ima-s',
        'ima-geral',
        'ima-geral ex-c'
    ]

    # Endpoint on the ANBIMA site that serves the CSV download.
    url = 'http://www.anbima.com.br/informacoes/ima/ima-carteira-down.asp'

    today = datetime.now().date()
    cal = utils.get_calendar()
    # NOTE(review): this replaces the date read from the base file with an
    # offset of -5 from today on the project calendar, so the value computed
    # above only affects the printed message -- confirm this is intended.
    ultima_data_base = cal.offset(today, -5)
    dates_range = list(utils.datetime_range(start=ultima_data_base, end=today))

    # Walk the window from the most recent date backwards.
    for dt_referencia in reversed(dates_range):
        for carteira in carteiras:
            path_download = os.path.join('downloads', carteira)
            os.makedirs(path_download, exist_ok=True)

            file_name = os.path.join(
                path_download,
                dt_referencia.strftime('%Y%m%d') + '_' + carteira + '.csv'
            )

            # Skip the remaining indexes for this date when the project
            # helper reports that the download should not happen.
            if utils.check_download(dt_referencia, file_name) is False:
                break

            # Download the file only if it has not been downloaded yet.
            if not os.path.exists(file_name):
                download_file_carteira(url, dt_referencia, file_name, carteira)

    print("Arquivos baixados com sucesso e importados para a base de dados")


if __name__ == '__main__':
    main()