-
Notifications
You must be signed in to change notification settings - Fork 0
/
example.py
134 lines (104 loc) · 3.54 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import collections
import math
from affirmative_sampling import affirmative_sampling
from affirmative_sampling.example import mobydick_tokens
print("""=====================================================
'Affirmative Sampling' by J. Lumbroso and C. Martínez
=====================================================
""")

# Example with Moby Dick (from the Gutenberg Project)
print(" Examples use Moby Dick (from the Gutenberg Project)")

# Slice-copy the packaged token sequence so the script works on its own object.
tokens = mobydick_tokens[:]
N = len(tokens)       # total number of tokens (with repetitions)
n = len(set(tokens))  # number of distinct tokens
k = 100               # parameter handed to the sampler below

# print the size (N), the cardinality (n),
# and expected size of sample for the given k
# The label spells out the whole expression that is evaluated, including the
# trailing "+k"; the examples further down report this same quantity as the
# "Expected size of sample".
print(f" N={N}, n={n}, k={k}, k*ln(n/k)+k={k*math.log(n/k)+k}")

# Compute a random sample with Affirmative Sampling
sample = affirmative_sampling(tokens=tokens, k=k)
# =======================================
# EXAMPLE 1: Number of tokens without 'e'


def _count_without_e(words):
    """Return how many items of *words* do not contain the letter 'e'."""
    return sum(1 for word in words if "e" not in word)


# Exact answer over the distinct tokens, then the same count over the sample.
TOTAL_number_of_tokens_without_e = _count_without_e(set(tokens))
SAMPLE_number_of_tokens_without_e = _count_without_e(sample["sample"].keys())

# Share of the sample made of 'e'-less tokens; it feeds both the percentage
# and the estimate scaled by the sampler's cardinality estimate.
_example_1_fraction = SAMPLE_number_of_tokens_without_e / sample["sampleSize"]
ESTIMATED_PROPORTION_number_of_tokens_without_e = _example_1_fraction * 100.0
ESTIMATED_number_of_tokens_without_e = (
    _example_1_fraction * sample["cardinalityEstimate"]
)

# Relative deviation of the estimate from the exact count, as a percentage.
_example_1_error = abs(round(
    (ESTIMATED_number_of_tokens_without_e
     / TOTAL_number_of_tokens_without_e - 1.0) * 100.0,
    2,
))

_EXAMPLE_1_REPORT = """
EXAMPLE 1: Number of tokens without 'e'
====================================
- Exact count: {}
- Estimated count: {}
- Error: {}%
- Size of sample: {}
- Expected size of sample: {}
- Tokens in sample without 'e': {}
- Proportion of tokens in sample without 'e': {}%
"""

print(_EXAMPLE_1_REPORT.format(
    TOTAL_number_of_tokens_without_e,
    round(ESTIMATED_number_of_tokens_without_e, 2),
    _example_1_error,
    len(sample["sample"]),
    round(k*math.log(n/k)+k, 2),
    SAMPLE_number_of_tokens_without_e,
    round(ESTIMATED_PROPORTION_number_of_tokens_without_e, 2),
))
# =======================================================================================
# EXAMPLE 2: Number of mice (rare elements, i.e., elements whose frequency is
# less than or equal to MICE_THRESHOLD)

# Single cutoff shared by both filters below and by the printed heading, so
# changing it here keeps the report and the computation in agreement.
MICE_THRESHOLD = 5

# Exact number of distinct tokens whose frequency in the full text is within
# the threshold.
TOTAL_number_of_distinct_mice = sum(
    1
    for (_token, count) in collections.Counter(mobydick_tokens).items()
    if count <= MICE_THRESHOLD
)

# Same filter applied to the (token, count) pairs stored in the sample.
SAMPLE_number_of_distinct_mice = sum(
    1
    for (_token, count) in sample["sample"].items()
    if count <= MICE_THRESHOLD
)

# Percentage of the sample made of mice.
ESTIMATED_PROPORTION_number_of_distinct_mice = (
    SAMPLE_number_of_distinct_mice / sample["sampleSize"] * 100.0
)

# Sample proportion scaled by the sampler's cardinality estimate.
ESTIMATED_number_of_distinct_mice = (
    SAMPLE_number_of_distinct_mice / sample["sampleSize"]
    * sample["cardinalityEstimate"]
)

print("""
EXAMPLE 2: Number of mice (freq. less or equal to {})
====================================================
- Exact count: {}
- Estimated count: {}
- Error: {}%
- Size of sample: {}
- Expected size of sample: {}
- Number of mice in sample: {}
- Proportion of mice in sample: {}%
""".format(
    MICE_THRESHOLD,
    TOTAL_number_of_distinct_mice,
    round(ESTIMATED_number_of_distinct_mice, 2),
    # Relative deviation of the estimate from the exact count, in percent.
    abs(round((ESTIMATED_number_of_distinct_mice /
               TOTAL_number_of_distinct_mice-1.0)*100.0, 2)),
    len(sample["sample"]),
    round(k*math.log(n/k)+k, 2),
    SAMPLE_number_of_distinct_mice,
    round(ESTIMATED_PROPORTION_number_of_distinct_mice, 2),
))
# Outputting the full sample
print("SAMPLE")
print("======")

# Highest counts first; entries with equal counts are ordered by token.
ordered_entries = sorted(
    sample["sample"].items(),
    key=lambda entry: (-entry[1], entry[0]),
)
for word, occurrences in ordered_entries:
    print(f"{occurrences:>5} {word} ")