-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataMiner.py
130 lines (106 loc) · 3.95 KB
/
DataMiner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# DataMiner.py -- scans old listentothis posts to create a nice testbase
import praw
import Actions
#accepted domains
domains = ["youtube.com", "archive.org", "bandcamp.com", "grooveshark.com", "npr.org", "radd.it", "soundcloud.com",
"spotify.com", "youtu.be", "vimeo.com"]
#limits on different groups
POSTS_FROM_URL = 10
POST_LIMIT = 10000
def process_domain(post):
#check that the domain supplied is good
domain_match = [domain for domain in domains if domain in post.url]
if len(domain_match) >= 1:
post.domain = domain_match[0]
return True
return False
def process_comments(post):
raddit_comments = [process_raddit_comment(comment.body) for comment in post.comments if comment.author and comment.author.name == "raddit-bot"]
if not len(raddit_comments):
return None
return raddit_comments[0]
def make_post(sub, post, comment):
try:
#make post and comment
post = Actions.make_post_url(sub, post.title, post.url)
Actions.make_comment(post, comment)
except Exception, e:
print str(e)
artist_match = "**name**|[**"
track_match = "**track**|**"
def process_raddit_comment(body):
info = ""
try:
#get artist
index = body.index(artist_match) + len(artist_match)
info += 'artist: ' + body[index:body.index('**', index)] + ' \n'
#get track
index = body.index(track_match) + len(track_match)
info += 'track: ' + body[index:body.index('**', index)] + ' '
except ValueError:
pass
return info
def main():
from CredentialsImport import CRImport
import utilitymethods as u
import logging
testbed = None
saved_posts = {}
# mark true if the script should remove all old posts from the testbed
DELETE_OLD = True
#import credentials
credentials = CRImport("TestCredentials.cred")
#create my reddit
r = u.create_multiprocess_praw(credentials)
#get testbed sub
testbed = u.get_subreddit(credentials, r)
#create datasource object
datasource = u.get_subreddit(credentials, r, credentials["DATAMININGSUB"])
if DELETE_OLD:
old_stream = praw.helpers.submission_stream(r, testbed, limit=POST_LIMIT)
results = []
try:
#delete all old posts
for post in old_stream:
try:
Actions.remove_post(post, delete=True)
print("deleted old post: %s..." % post.title[:20])
except AttributeError:
# Post or Comment may have been deleted between retrieving it
# and accessing its fields
pass
except AssertionError, e:
logging.log(logging.DEBUG, str(e) + "\nNo Posts!")
for domain in domains:
saved_posts[domain] = 0
#now go through datasource subreddit
try:
#create stream
stream = datasource.get_new(limit=POST_LIMIT)
count = 0
for post in stream:
if not len(domains):
break
count += 1
print "processing post " + str(count)
#process posts
results = []
if not process_domain(post):
continue
print "valid domain..."
#load comments
print "scanning comments..."
#process comments
comment = process_comments(post)
if comment:
#check domain
if saved_posts[post.domain] < POSTS_FROM_URL:
saved_posts[post.domain] += 1
print "X-Posting " + post.title[:20] + "... " + post.domain + "[" + str(saved_posts[post.domain]) + "/" + str(POSTS_FROM_URL) + "]"
make_post(testbed, post, comment)
elif saved_posts[post.domain] >= POSTS_FROM_URL and post.domain in domains:
domains.remove(post.domain)
except Exception, e:
print str(e)
if (__name__ == "__main__"):
main()