-
Notifications
You must be signed in to change notification settings - Fork 0
/
d2fa.ml
368 lines (319 loc) · 12.9 KB
/
d2fa.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
open Batteries
open Printf
open Regex_dfa
open Ean_std
type deferment = Root | To of int | Unknown
type 'label defer_label = {
d_label : 'label; (* the original label for the state *)
mutable defer : deferment
(* where q defers to - None if not deferred, Some -1 if not yet processed *)
}
and ('label, 'map, 'dec) defer_state = ('label defer_label, 'map, 'dec) state
type ('label, 'dec) d2fa = ('label, int IMap.t, 'dec) defer_state fa
type ('a, 'b) t = ('a, 'b) d2fa
let is_root q = q.label.defer = Root
module EdgeSet = Set.Make(struct
type t = int * int * int
let compare (a,b,w1) (c,d,w2) =
if w1 = w2
then compare (a,b) (c,d)
else w2 - w1
end)
let print_es oc es =
EdgeSet.print ~first:"{" ~sep:"" ~last:"}"
(fun oc (i,j,w) -> Printf.fprintf oc "(%d, %d, %d)" i j w) oc es
let print_table t =
IMap.iter_range (fun i j q -> printf "(%d,%d)->%d " i j q) t; print_newline()
(* let edges = Value.observe_int_ref "d2fa candidate edges" (ref 0) *)
(* build the table of commonalities between each pair of states *)
let build_srg ?(min_weight = 10) dfa =
let n = Array.length dfa.qs in
let es = ref EdgeSet.empty in
let counts = Array.create (max_commonality+1) 0 in
for i = 0 to n-1 do
(* printf "%d: " i; *)
for j = i+1 to n-1 do
let w = commonality dfa.qs.(i) dfa.qs.(j) in
counts.(w) <- counts.(w) + 1;
(* Printf.printf "I:"; print_table dfa.qs.(i).map;
printf "J:"; print_table dfa.qs.(j).map;
printf "--------------\n"; *)
if w > min_weight then begin
(* printf "%d,%d " j w; *)
es := EdgeSet.add (i,j,w) !es;
end
done;
(* printf "\n"; *)
done;
(* eprintf "#Commonalities: ";
Array.iteri (fun i c -> if c > 0 then eprintf "%d of %d," c i) counts;
eprintf "\n"; *)
!es
let build_srg d = wrap build_srg "D2fa.build_srg" d
let kruskal_roots n es loopers =
let cl = Array.init n (fun i -> [i], loopers.(i)) in
let find i = Array.findi (fun (lst,_) -> List.mem i lst) cl in (* optimize? *)
let union i j =
if snd cl.(i) then (* i has self-loop *) (
cl.(i) <- (fst cl.(i) @ fst cl.(j)), (snd cl.(i) || snd cl.(j));
cl.(j) <- [], false;
) else ( (* j might have self-loop *)
cl.(i) <- (fst cl.(j) @ fst cl.(i)), (snd cl.(i) || snd cl.(j));
cl.(j) <- [], false;
)
in
let try_join ((i, j, _) as e) t =
(* Printf.printf "Kruskal - Joining %d and %d..." i j; *)
let ipos = find i and jpos = find j in
if ipos = jpos || (snd cl.(ipos) && snd cl.(jpos)) then
(* can't make a loop or join two trees with self-looping states *)
((*print_string "failed\n";*) t)
else ((*print_string "succeeded\n";*) union ipos jpos; EdgeSet.add e t) in
let forest = EdgeSet.fold try_join es EdgeSet.empty in
(* Array.print ~first:"Cl: [" ~last:"]\n" (List.print Int.print) stdout cl; *)
let trees = Array.filter_map (function ([],_) | ([_],_) -> None | (lst,_) -> Some lst) cl in
(*
let print_idx oc i =
if loopers.(i) then IO.write oc '(';
Int.print oc i;
if loopers.(i) then IO.write oc ')';
in
Array.print ~first:"Clusters: [" ~last:"]\n" (List.print print_idx) stdout trees;
*)
forest, trees
let loop_count dfa i = IMap.fold_range (fun lo hi dec acc -> if dec = i then acc + (hi-lo+1) else acc) dfa.qs.(i).map 0
let is_self_loop limit dfa i =
loop_count dfa i > limit
(* |> tap (fun x -> if x then printf "Self-loop found on: %d\n" i) *)
let build_sf ?(self_loop_limit=128) dfa = (* debug wrapper *)
let es = build_srg dfa in
(* print_string "High-weight Edge Set: "; print_es es; print_newline (); *)
let n = Array.length dfa.qs in
let loops = Array.init n (is_self_loop self_loop_limit dfa) in
kruskal_roots n es loops
(*
let build_sf dfa =
let spanning_forest, clusters = build_sf dfa in
eprintf "#Spanning Forest: %a\n" print_es spanning_forest;
spanning_forest, clusters
*)
let build_sf ?self_loop_limit d = wrap (build_sf ?self_loop_limit) "D2fa.build_sf" d
module MPM = MultiPMap
(* build edge_distances and deferment-edge lookup through multi-set *)
let defs_of_sf n sf =
let add_to_mpm (i,j,_) acc = MPM.add i j (MPM.add j i acc) in
let deferments = EdgeSet.fold add_to_mpm sf
(MPM.create Int.compare Int.compare) in
let distances = Array.make_matrix n n max_int in
let set_dist (i,j,w) = distances.(i).(j) <- w; distances.(j).(i) <- w in
EdgeSet.iter set_dist sf;
distances, deferments
(* Returns the max distance in [ds] metric from [node] to any of [nodes] *)
let total_dist ds nodes node =
(* printf "Total dist: %dx%d ds, nodes: %a, node: %d%!\n" (Array.length ds) (Array.length ds.(0)) (List.print Int.print) nodes node; *)
let nodes = Array.of_list nodes in
let dist i j = ds.(nodes.(i)).(nodes.(j)) in
let dist_a = Array.map (fun j -> ds.(node).(j)) nodes in
let rec dij ns =
if ns = [] then () else
let min_dist_i = minarg (fun i -> dist_a.(i)) (List.enum ns) in
let update_dist j dj =
let dm = dist_a.(min_dist_i) + dist min_dist_i j in
if dm > 0 && dm < dj then dist_a.(j) <- dm
in
Array.iteri update_dist dist_a;
dij (List.remove ns min_dist_i)
in
dij (Array.range nodes |> List.of_enum);
Array.reduce max dist_a (* TODO: INVESTIGATE (+) AS METRIC *)
let total_degree ds nodes node =
List.enum nodes
|> map (fun i -> if ds.(i).(node) = max_int then 0 else ds.(i).(node))
|> Enum.sum
(* |> tap (printf "TD of %d among %a: %d\n" node (List.print Int.print) nodes) *)
let degree ds x = Array.enum ds.(x) |> filter ((>) 0) |> Enum.count
(* get the center of all the nodes *)
let get_center ds = function
[] | [_] -> assert false (* no empty or single-node clusters *)
| [x;_] -> x (* either node is the center of a two-node tree *)
| nodes -> maxarg (total_degree ds nodes) (List.enum nodes)
(*
let get_root vs = maxarg loop_count (List.enum vs)
*)
let reduce_transitions defs qs center =
let reduce d1 d2 = match d1, d2 with None, None -> assert false
| None, Some _ -> Some (-1)
| Some x1, Some x2 when x1 = x2 -> None
| Some x, _ -> Some x
in
let defer_to parent child =
(* printf "DT P%d C%d.." parent child; *)
match qs.(child) with
| {label={defer = Unknown}} as cq ->
(* reduce the child's transition map *)
let new_map = IMap.union reduce cq.map qs.(parent).map in
let label = {cq.label with defer=To parent} in
qs.(child) <- {cq with map=new_map; label=label };
(* printf "ok\n"; *)
true
| _ -> (* printf "loop\n"; *) false (* anything else is a loop *)
in
let rec defer p =
MPM.find p defs
|> PSet.iter (fun c -> if defer_to p c then defer c )
in
(* Printf.printf "Deferring from center: %d\n" center; *)
qs.(center).label.defer <- Root;
defer center
let of_dfa ?self_loop_limit dfa =
let (forest, trees) = build_sf ?self_loop_limit dfa in
(* Array.print ~first:"#Trees: [" ~last:"]\n" (List.print Int.print) stdout trees
;
printf "Forest: \n%a\n" print_es forest; *)
let n = Array.length dfa.qs in
let (_dists, defs) = wrap (defs_of_sf n) "Defs_of_sf" forest in
let centers = wrap (Array.map (get_center _dists)) "Get_center" trees in
(* let centers = wrap (Array.map List.hd) "Get_root" trees in *)
Array.print ~first:"#Centers: [" ~last:"]\n" Int.print stdout centers;
let qs = Array.map (fun i -> {i with label={d_label = i.label; defer=Unknown}} ) dfa.qs in
Array.iter (reduce_transitions defs qs) centers;
{qs = qs; q0 = qs.(dfa.q0.id)}
let merge_maps parent child = IMap.fold_range IMap.add_range child parent
let undefer d2fa q =
let rec get_full_map q = match q.label.defer with
Unknown | Root -> q.map
| To q_parent ->
let par_map = get_full_map d2fa.qs.(q_parent) in
merge_maps par_map q.map
in
{q with label = {q.label with defer = Root}; map=get_full_map q}
let to_dfa_q d q =
let q_new = undefer d q in {q_new with label = q.label.d_label}
let to_dfa d2fa = map_qs (to_dfa_q d2fa) d2fa
let qmap_size qmap =
IMap.fold_range (fun lo hi _ acc -> hi - lo + 1 + acc) qmap 0
(* compression ratio - smaller is better *)
let compression d2fa =
let total = Array.length d2fa.qs * Regex_dfa.num_chars |> float in
let needed = Array.enum d2fa.qs |> map (fun q -> qmap_size q.map) |> Enum.sum |> float in
needed /. total
(* LISP FORMAT
# comment line
<no of states>
(defptr (char1 state1 char2 state2 ...) <accepted rule numbers>)
...
*)
let print_lisp oc d2fa =
fprintf oc "# Made by MSU's regex_dfa\n%d\n" (Array.length d2fa.qs);
let print_tmap oc v = IMap.iter (fun i qi -> fprintf oc "%d %d " i qi) v in
let print_q oc {label={defer=def};map=map;dec=dec;id=id} =
(match def with Unknown -> fprintf oc "( -1 ( "
| Root -> fprintf oc "( %d ( " id
| To q_def -> fprintf oc "( %d ( " q_def);
print_tmap oc map; IO.nwrite oc ") ";
if dec <> [] then List.print Int.print oc dec; IO.nwrite oc " )";
in
print_fa ~ids:false print_q oc d2fa
let print_list_dec oc dec = if dec <> [] then List.print Int.print oc dec
let print_q_gen ~print_m ~print_dec oc i {label={d_label=lab; defer=def};map=map;dec=dec} =
let id = "N" in
(match def with Unknown | Root -> () | To q_def -> fprintf oc "%s%d -> %s%d [style=dotted];\n" id i id q_def);
fprintf oc "%s%d [label=\"%d %a\"]; //%a\n" id i i print_dec dec Unit.print lab;
let trans =
IMap.fold_range (fun lo hi q acc -> MultiPMap.add q (lo,hi) acc)
map MultiPMap.empty in
MultiPMap.iter (fun q lhset ->
fprintf oc "%s%d -> %s%a [label=\"" id i id print_m q;
PSet.print ~first:"" ~last:"" ~sep:" " print_rng oc lhset;
fprintf oc "\"];") trans;
fprintf oc "\n"
let print_dot_gen ~print_m ~print_dec ~id oc d2fa =
fprintf oc "digraph %s {\n" id;
Array.iteri (print_q_gen ~print_m ~print_dec oc) d2fa.qs;
fprintf oc "}\n"
let print_dot ~id oc d2fa = print_dot_gen ~print_m:Int.print ~print_dec:print_list_dec ~id oc d2fa
let print _ = print_dot
let print_stats oc d2fa =
let size_counts = Array.create (num_chars+1) 0 in
Array.iter (fun q -> let i = qmap_size q.map in size_counts.(i) <- size_counts.(i) + 1) d2fa.qs;
Array.print ~first:"Transition range Counts: [" ~last:"]\n" Int.print oc size_counts;
fprintf oc "Total transition ranges: %d\n" (Array.reduce (+) size_counts);
()
let run d2fa chars =
let state_count = Array.length d2fa.qs in
let next_q (q,_) c =
let q = d2fa.qs.(q) and c = Char.code c in
let rec loop q n =
if n > state_count then failwith (sprintf "deferment loop detected - q%d, c%d" q.id c);
try (IMap.find c q.map, n)
with Not_found ->
match (q.label.defer) with
Root | Unknown -> loop d2fa.q0 (n+1)
| To q_def -> loop d2fa.qs.(q_def) (n+1)
in
loop q 0
in
scanl next_q (d2fa.q0.id,0) chars
let print_trace dfa oc trace =
let last = Enum.reduce (fun _ y -> y) (Enum.clone trace) in
Enum.print Int.print oc trace;
printf ": ";
List.print Int.print oc dfa.qs.(last).dec;
IO.write oc '\n'
let print_accept dfa oc trace =
trace /@ (fun q -> dfa.qs.(q).dec)
// (fun x -> x <> [])
|> Enum.print ~last:"\n" (List.print Int.print) oc
let add_pair (a,b) (c,d) = a+c, b+d
let summarize oc dfa =
let tr_count, range_count =
Array.enum dfa.qs |>
map (fun q -> IMap.enum q.map |>
Enum.fold (fun (t,r) (i,j,_q) -> (t+j-i+1, r+1)) (0,0))
|> Enum.reduce add_pair in
let deferred, root =
Array.fold_left (fun (def, root) {label={defer=d}} ->
match d with
Unknown -> (def, root)
| To _ -> (def+1, root)
| Root -> (def,root+1))
(0,0) dfa.qs in
Printf.fprintf oc "D2FA %d states, %d transitions (%d ranges)\n" (Array.length dfa.qs) tr_count range_count;
(* let finals = Array.fold_left (fun a q -> if q.dec = [] then a else q.dec :: a) [] dfa.qs in
List.print ~last:"]\n" (List.print Int.print) oc finals; *)
Printf.fprintf oc "D2FA: %d deferred, %d roots, %d not deferred\n%!" deferred root (Array.length dfa.qs - deferred - root);
()
let debug_d2fa = true
let run_stream d2fa chars =
let best_pri, ret =
match d2fa.q0.dec with
Some (pri,act,caq) -> ref pri, ref (act,caq,chars)
| None -> ref max_int, ref ([],None,chars) in
let rec get_next_id q c = (* handle deferments *)
try IMap.find (Char.code c) q.map
with Not_found ->
match q.label.defer with
| To q' -> if debug_d2fa then eprintf "D%d %!" q'; get_next_id d2fa.qs.(q') c
| Root | Unknown -> -1
in
let rec next_state q cs n =
match LazyList.get cs with
| None -> !ret,n (* return here *)
| Some ((_,c),cs') ->
let q_next_id = get_next_id q c in
if debug_d2fa then
eprintf "%s:%d->%d %!" (Char.escaped c) q.id q_next_id;
if q_next_id = -1 then
!ret,n (* return here *)
else if q_next_id >= Array.length d2fa.qs then
failwith "referenced state %d invalid"
else
let q_next = d2fa.qs.(q_next_id) in
(match q_next.dec with
| Some (pri,act,caq) when pri <= !best_pri ->
best_pri := pri;
ret := act,caq,cs'
| _ -> () );
next_state q_next cs' (n+1)
in
next_state d2fa.q0 chars 1