diff --git a/README.md b/README.md
index cc7739a..d68d6cf 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,11 @@ Demonstration of Efficient Fuzzy Matching techniques using MongoDB
 
 This demonstrates the use of Metaphone, Garbled Search against a term index, Multifield and quorum searching.
 
-unpack the datafile in datagen then run datagen.pl to generate 5 Million documents.
+unpack the datafile in datagen then run `datagen.py` to generate 5 Million documents.
 
-then run fuzzgo.py
+then run `fuzzgo.py`
 
-Needs pymongo and bottle
+Needs `pymongo` and `bottle`
 
 This is my first python program - feel free to refactor all the code :-)
 
diff --git a/datagen/datagen.py b/datagen/datagen.py
index cad9324..7491d52 100644
--- a/datagen/datagen.py
+++ b/datagen/datagen.py
@@ -27,12 +27,12 @@ def damage_term(word):
 
 connection_string = "mongodb://localhost"
 connection = pymongo.MongoClient(connection_string)#
-connection.drop_database('people');
+connection.drop_database('people')
 
 database = connection.people
 
-database.nominals_v2.drop();
-database.nominals_v2_vocab.drop();
+database.nominals_v2.drop()
+database.nominals_v2_vocab.drop()
 
 #Need this index up front
 database.nominals_v2_names.ensure_index([("p",pymongo.ASCENDING), ("_id",pymongo.ASCENDING)],unique=True)
@@ -41,9 +41,9 @@ def damage_term(word):
 males = open('male.txt').read().splitlines()
 females = open('female.txt').read().splitlines()
 lasts = open('last.txt').read().splitlines()
-postcodes = open("psmall.csv").read().splitlines();
-streets = open("streets.csv").read().splitlines();
-stdcodes = open("stdcodes.txt").read().splitlines();
+postcodes = open("psmall.csv").read().splitlines()
+streets = open("streets.csv").read().splitlines()
+stdcodes = open("stdcodes.txt").read().splitlines()
 
 
 for x in xrange(1,1000):
@@ -54,7 +54,7 @@ def damage_term(word):
     lastname=None
     middleone=None
     middletwo=None
-    lastname = choice(lasts).partition(' ')[0] 
+    lastname = choice(lasts).partition(' ')[0]
     lastname = damage_term(lastname)
 
     gender = random.randint(0,1)
@@ -88,27 +88,23 @@ def damage_term(word):
     postcode = parts[0]
     lat = parts[1]
     lon = parts[2]
-    town = parts[13].partition(' ')[0].replace('"','');
-    county = parts[6].replace(' County','').replace('"','');
+    town = parts[13].partition(' ')[0].replace('"','')
+    county = parts[6].replace(' County','').replace('"','')
     #print postcode + "," + lat + "," + lon + "," + town + "," + county
 
     street = choice(streets)
     streetno = random.randint(1,200)
 
-    streettype = choice(["Rd.","Road","Ln.","Lane","Crescent","St.","Street"]);
-
-
-
+    streettype = choice(["Rd.","Road","Ln.","Lane","Crescent","St.","Street"])
+
     stdcode = choice(stdcodes)
-    phoneshort = random.randint(200000,8900000);
+    phoneshort = random.randint(200000,8900000)
     phoneno = stdcode + str(phoneshort)
 
-    mobile = random.randint(000000000,999999999);
+    mobile = random.randint(000000000,999999999)
     mobileno = "07"+str(mobile)
-
-
+
     address = str(streetno) + " " + street + " " + streettype + ", "+ town + ", " + county
-
     metafirstname = soundslike(firstname)
     metalastname = soundslike(lastname)
 
@@ -159,19 +155,15 @@ def damage_term(word):
 
 
     names.append({ "_id" : middletwo , "p" : [middletwo[0],middletwo[1]] })
-
-
-
-    database.nominals_v2.insert(records);
+
+    database.nominals_v2.insert(records)
     try:
         database.nominals_v2_names.insert(names,continue_on_error=True) # This WILL fail a lot - thats the idea could upsert and count!
     except pymongo.errors.DuplicateKeyError:
         pass
 
 
-
-
-
+
     print x * 5000
 
 database.nominals_v2.ensure_index([("firstname",pymongo.ASCENDING)])
@@ -189,8 +181,6 @@ def damage_term(word):
 database.nominals_v2.ensure_index([("allnames",pymongo.ASCENDING)])
 database.nominals_v2.ensure_index([("allmetanames",pymongo.ASCENDING)])
 
-
-
 #What if we want to extend it out!
 
 
diff --git a/fuzzgo.py b/fuzzgo.py
index 5fe9314..0fb88fd 100644
--- a/fuzzgo.py
+++ b/fuzzgo.py
@@ -6,7 +6,6 @@
 
 '''
 
-from bson import Binary, Code
 from bson.json_util import dumps
 import json
 import re
@@ -132,20 +131,20 @@ def search_simple():
         for fieldname in fieldnames:
             if fieldname in queryfields:
                 orvals.append({queryfields[fieldname]:queryterms[fieldname]})
-        query["$or"] = orvals;
+        query["$or"] = orvals
 
     else: #Run an aggregation Query - then a query by ID
         for fieldname in fieldnames:
             if fieldname in queryfields:
                 orvals.append({queryfields[fieldname]:queryterms[fieldname]})
-        query["$or"] = orvals;
+        query["$or"] = orvals
 
         #Match on the OR of the fields - parallel - individual indexes
         #Project a score by adding one for each match
         #Convert each field to a aggregation boolean term {$eq:[a,b]}
 
         if anyfield != "true":
-            innermatches=[];
+            innermatches=[]
             for fieldname in fieldnames:
                 if fieldname in queryfields:
                     #If queryterms[fieldname] is a $in clause we need a different model
@@ -172,7 +171,7 @@ def search_simple():
 
 
         else :
-            innermatches=[];
+            innermatches=[]
             for fieldname in fieldnames:
                 if fieldname in queryfields:
                     #If queryterms[fieldname] is a $in clause we need a different model
@@ -207,9 +206,9 @@ def search_simple():
         print dumps(counts)
 
         if len(counts["result"]) <1:
-            query["$or"] = [{"_id":"xyzzyzzy"}]; #Quick hack for demo
+            query["$or"] = [{"_id":"xyzzyzzy"}] #Quick hack for demo
         else:
-            query["$or"] = counts["result"];
+            query["$or"] = counts["result"]
 
 
 
diff --git a/html/frontpage.html b/html/frontpage.html
index 3092c71..b66dcbe 100644
--- a/html/frontpage.html
+++ b/html/frontpage.html
@@ -51,7 +51,7 @@
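For readers following the `datagen.py` changes: `nominals_v2_names` is the term index that backs the garbled search. Each distinct name is stored once, keyed by the name itself, with `p` holding its first two letters, under the unique `(p, _id)` index created up front. Below is a minimal sketch of resolving a damaged term against that index. It assumes PyMongo 3+, a local `mongod`, uppercase vocabulary, and Python's standard `difflib` standing in for whatever edit-distance test the demo itself applies; `candidates_for` is an illustrative helper, not part of the repo.

```python
import difflib
import pymongo

client = pymongo.MongoClient("mongodb://localhost")
names = client.people.nominals_v2_names

def candidates_for(term, n=10):
    """Find vocabulary terms that plausibly match a garbled input term."""
    term = term.upper()  # assumption: datagen stores names uppercased
    # Anchor on either of the first two letters so the (p, _id) index is
    # usable; a damaged term usually keeps at least one of them intact.
    cursor = names.find({"p": {"$in": list(term[:2])}}, {"_id": 1})
    vocab = [doc["_id"] for doc in cursor]
    # difflib stands in for a real edit-distance test; the letter anchor
    # keeps the candidate list small enough to score in memory.
    return difflib.get_close_matches(term, vocab, n=n, cutoff=0.8)

print(candidates_for("SIMTH"))  # should surface 'SMITH'-like terms
```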
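The `search_simple` edits above are behavior-neutral, but the quorum logic they touch is worth spelling out: every queried field becomes a boolean `{$eq:[a,b]}` aggregation term, the booleans are summed into a score, and only documents whose score reaches the quorum survive, after which the full documents are fetched by `_id`. Here is a rough sketch of that pipeline, assuming PyMongo 3+ against the `nominals_v2` collection (the `quorum_search` helper and its example field names are illustrative, not the repo's API):

```python
import pymongo

client = pymongo.MongoClient("mongodb://localhost")
people = client.people.nominals_v2

def quorum_search(terms, quorum):
    """terms: {field: value}; returns docs matching at least `quorum` fields."""
    # $or across the fields lets each one use its own index in parallel.
    prefilter = {"$or": [{f: v} for f, v in terms.items()]}
    # One boolean {$eq: [...]} term per field, summed into a match score.
    score = {"$add": [{"$cond": [{"$eq": ["$" + f, v]}, 1, 0]}
                      for f, v in terms.items()]}
    pipeline = [
        {"$match": prefilter},
        {"$project": {"score": score}},
        {"$match": {"score": {"$gte": quorum}}},
    ]
    # Fetch the full documents by _id afterwards, as fuzzgo.py does.
    ids = [d["_id"] for d in people.aggregate(pipeline)]
    return list(people.find({"_id": {"$in": ids}}))

# e.g. require any 2 of 3 fields to match:
# quorum_search({"firstname": "JOHN", "lastname": "SMITH", "town": "LEEDS"}, 2)
```

Leading with the `$or` match is what the in-code comment "Match on the OR of the fields - parallel - individual indexes" refers to: each field can be satisfied from its own index before the scoring stage does the expensive work.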