forked from merwin-asm/OpenCrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch.py
123 lines (78 loc) · 2.75 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
Open Crawler v 1.0.0 | search.py
-- Note: the official search function doesn't count clicks or learn from search patterns, etc. :]
"""
from mongo_db import connect_db, _DB
from rich import print
import time
import json
import sys
import os
import re
def mongodb():
    """Load MongoDB credentials from ``config.json`` and connect to the DB.

    If the config file is missing or contains invalid JSON, re-runs
    ``config.py`` to regenerate it, then reads the file once more
    (letting any second failure propagate).

    Raises:
        KeyError: if ``MONGODB_PWD`` / ``MONGODB_URI`` are absent from the config.
        OSError / json.JSONDecodeError: if the regenerated config is still unreadable.
    """
    config_path = "config.json"

    try:
        with open(config_path, "r") as fh:
            configs = json.load(fh)
    except (OSError, json.JSONDecodeError):
        # Re-configure.  NOTE: os.system() does NOT raise when the command
        # fails -- it returns the exit status -- so the original
        # try/except around it could never trigger the "python" fallback.
        # Check the return code instead: try "python3" first, fall back to
        # "python" if that returned non-zero.
        if os.system("python3 config.py") != 0:
            os.system("python config.py")
        # Retry once with the (hopefully) regenerated file.  The original
        # code re-used the name `config_file` for the open file handle, so
        # this retry path called open() on a file object and crashed with
        # TypeError -- fixed by keeping the path in its own variable.
        with open(config_path, "r") as fh:
            configs = json.load(fh)

    # Setting up configs
    MONGODB_PWD = configs["MONGODB_PWD"]
    MONGODB_URI = configs["MONGODB_URI"]

    # Initializes MongoDB
    connect_db(MONGODB_URI, MONGODB_PWD)
def _search_term(term):
    """Return de-duplicated website URLs matching *term* (case-insensitive).

    Matches *term* as a regex against the ``recc``, ``keys``, ``desc`` and
    ``website`` fields of the ``Crawledsites`` collection, preserving the
    order in which websites first appear in the result cursor.
    """
    # Compile once instead of four times per query.
    # NOTE(review): the term is treated as a raw regex (as in the original
    # code), so characters like '+' have regex meaning.
    pattern = re.compile(term, re.IGNORECASE)
    docs = _DB().Crawledsites.find({"$or": [
        {"recc": {"$regex": pattern}},
        {"keys": {"$regex": pattern}},
        {"desc": {"$regex": pattern}},
        {"website": {"$regex": pattern}},
    ]})
    # Order-preserving de-duplication; a set makes membership O(1)
    # (the original scanned a list per document).
    seen = set()
    ordered = []
    for doc in docs:
        site = doc["website"]
        if site not in seen:
            seen.add(site)
            ordered.append(site)
    return ordered


mongodb()  # Connects to DB

# Get the search terms from the command line.
search = sys.argv[1:]

# The original crashed with IndexError when run without arguments.
if not search:
    print("[red]Usage: python search.py <term> [<term> ...][/red]")
    sys.exit(1)

RESULTS = {}  # website -> number of search terms it matched

if len(search) > 1:
    # Multi-term search: rank sites by how many distinct terms they match.
    t_1 = time.time()
    for term in search:
        for site in _search_term(term):
            RESULTS[site] = RESULTS.get(site, 0) + 1
    t_2 = time.time()

    ranked = sorted(RESULTS.items(), key=lambda kv: kv[1], reverse=True)
    c = 0
    for site, hits in ranked:
        # Only report sites that matched more than one term.
        if hits > 1:
            print(f"[green]Link: {site} | Common words: {hits} [/green]")
            c += 1
    print(f"[dark_orange]Query : {search} | Total Results : {c} | Time Taken : {t_2 - t_1}s[/dark_orange]")
else:
    # Single-term search: list every matching site.
    t_1 = time.time()
    res = _search_term(search[0])
    t_2 = time.time()

    for site in res:
        print(f"[green]Link: {site}[/green]")
    print(f"[dark_orange]Query : {search} | Total Results : {len(res)} | Time Taken : {t_2 - t_1}s[/dark_orange]")