-
Notifications
You must be signed in to change notification settings - Fork 1
/
script.py
101 lines (82 loc) · 3.82 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# script.py
from os import environ
from os.path import join, dirname
from dotenv import load_dotenv
import re
import pandas
from TwitterAPI import TwitterAPI, TwitterPager
# build the path to the .env file that sits next to this script
try:
    # __file__ is undefined when the code is sourced interactively from
    # the current directory; that raises NameError (the original used a
    # bare `except:`, which would also hide unrelated failures).
    dotenv_path = join(dirname(__file__), '.env')
except NameError:
    dotenv_path = '.env'
# load environment variables (Twitter API credentials) from the file
load_dotenv(dotenv_path)
if __name__ == "__main__":
    # connect to the Twitter API using credentials from the environment
    api = TwitterAPI(consumer_key=environ['TWITTER_CONSUMER_KEY'],
                     consumer_secret=environ['TWITTER_CONSUMER_SECRET'],
                     access_token_key=environ['TWITTER_ACCESS_TOKEN'],
                     access_token_secret=environ['TWITTER_ACCESS_TOKEN_SECRET'])

    # scrape all prior tweets to check which packages I've already tweeted
    SCREEN_NAME = 'RLangPackage'
    pager = TwitterPager(api,
                         'statuses/user_timeline',
                         {'screen_name': SCREEN_NAME, 'count': 100})

    # parse out the package name that occurs before the hyphen at the beginning
    previous_pks = []
    for item in pager.get_iterator(wait=3.5):
        if 'text' in item:
            this_pkg = re.sub(r"^([A-Za-z0-9.]+) - (.*)", r"\1", item['text'])
            previous_pks.append(this_pkg)
    # add packrat, it wasn't formatted correctly when it tweeted
    previous_pks.append('packrat')

    # convert the previously-tweeted package names to a dataframe
    # (NOTE: the original also called set_index('name') on both frames,
    # but set_index returns a new frame unless inplace=True, so those
    # calls were no-ops and are dropped here)
    prev_df = pandas.DataFrame({'name': previous_pks})

    # load the data I've compiled on R packages
    url = "https://raw.githubusercontent.com/StevenMMortimer/one-r-package-a-day/d94392d7abb9a7ade71c75e77c4284ad6e350969/r-package-star-download-data.csv"
    all_df = pandas.read_csv(url)

    # "anti join" to throw away previously tweeted rows: a left merge with
    # an indicator, keeping only left-only rows. (how='left' replaces the
    # original how='outer', which pulled in right-only rows only to
    # discard them; merging explicitly on 'name' documents the key.)
    all_df = pandas.merge(all_df, prev_df, on='name', how='left',
                          indicator=True)
    all_df = all_df[all_df['_merge'] == 'left_only']

    # focus on packages in a middle ground of downloads and stars
    filtered_df = all_df[all_df['github_url'].notnull()]
    # BUGFIX: `|` binds tighter than `<` in Python, so the original
    # `df.stars.notnull() | df.stars < 1000` actually evaluated
    # `(df.stars.notnull() | df.stars) < 1000`. The intent, mirroring the
    # downloads filter below, is: stars present AND fewer than 1000.
    filtered_df = filtered_df.loc[
        lambda df: df.stars.notnull() & (df.stars < 1000)]
    filtered_df = filtered_df[filtered_df['downloads'].notnull()]
    filtered_df = filtered_df.loc[lambda df: df.downloads < 1000000]

    # randomly select one of the remaining rows
    selected_pkg = filtered_df.sample(1)

    # pull out the name and description to see if we need to truncate
    # because of Twitter's 280 character limit; collapse whitespace runs
    prepped_name = selected_pkg.iloc[0]['name']
    prepped_desc = re.sub(r'\s+', ' ',
                          selected_pkg.iloc[0]['description']).strip()

    # determine how many urls are in the description since Twitter
    # shortens or expands all URLs to 23 chars.
    # BUGFIX: raw string — in the original non-raw pattern "\bwww" the
    # "\b" was a literal backspace character, so bare "www" URLs were
    # never counted.
    urls_count = len(re.findall(r"https|http|\bwww|<www", prepped_desc))
    name_len = len(prepped_name)

    # determine the max length of the description:
    #   280 tweet char max
    #   minus 3 for " - "
    #   minus 9 for the " #rstats " hashtag
    #   minus (urls in description + one github url) * 23,
    #     because all links are counted as 23 chars
    #   minus the package name length
    max_len = 280 - 3 - ((urls_count + 1) * 23) - 9 - name_len

    # truncate the description if needed, reserving 3 chars for the "..."
    # (the original's else-branch self-slice to its own length was a no-op)
    if len(prepped_desc) > max_len:
        prepped_desc = prepped_desc[0:(max_len - 3)] + "..."

    # cobble together the tweet text
    TWEET_TEXT = prepped_name + " - " + prepped_desc + \
        " #rstats " + selected_pkg.iloc[0]['github_url']
    print(TWEET_TEXT)

    # tweet it out to the world!
    r = api.request('statuses/update', {'status': TWEET_TEXT})
    print('SUCCESS' if r.status_code == 200 else 'PROBLEM: ' + r.text)