-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxco-sfilter.xqm
303 lines (284 loc) · 12.3 KB
/
xco-sfilter.xqm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
module namespace sf="http://www.parsqube.de/ns/xco/string-filter";
(:
: C o m p i l e s t r i n g f i l t e r
: ===========================================
:)
(:~
: Compiles a string filter into a structured representation.
: The representation is a <stringFilter> element with the following child elements:
: include: a set of regexes and/or strings or substrings to be matched
: exclude: a set of regexes and/or strings or substrings which must not be matched
:
: @param sfilter a string filter string
: @return a map representing the string filter
:)
declare function sf:compileStringFilter(
$sfilter as xs:string?)
as element(stringFilter)? {
let $itemsAndFlags := sf:splitStringIntoItemsAndFlags($sfilter, '#')
let $flags := $itemsAndFlags[1]
let $items := subsequence($itemsAndFlags, 2)
(: global flags :)
let $ignoreCase := not(contains($flags, 'c'))
let $patternIsRegex := contains($flags, 'r')
let $addAnchors := not(contains($flags, 'A'))
let $patterns := $items ! replace(., '^\s+|\s+$', '')[string()]
return if (empty($patterns)) then () else
let $patternsPlus := $patterns[not(starts-with(., '~'))]
let $patternsMinus := $patterns[starts-with(., '~')] ! substring(., 2)
return
if (empty(($patternsPlus, $patternsMinus))) then () else
<stringFilter text="{$sfilter}">{
if (empty($patternsPlus)) then () else
<include>{sf:compileSfPatternSet(
$patternsPlus, $ignoreCase, $patternIsRegex, $addAnchors)
}</include>,
if (empty($patternsMinus)) then () else
<exclude>{sf:compileSfPatternSet(
$patternsMinus, $ignoreCase, $patternIsRegex, $addAnchors)
}</exclude>
}</stringFilter>
};
(:~
: Translates a list of string filter patterns into a structured
: representation. A pattern is a glob pattern or a regular expression.
: The structured representation is a map.
:
: @param patterns a list of patterns
: @param ignoreCase if true, regex matching ignores case
: @param patternIsRegex if true, patterns are interpreted as regular
: expressions, otherwise as glob patterns
: @param addAnchors if true, by default glob patterns are translated
: into regular expressions with anchors indicating the begin and
: end of the string; the default can be overridden by flags 'a'
: (add anchors) and 'A' (do not add anchors).
: @return a map with possible entries 'empty', 'regexes', 'flags',
: 'strings', 'substrings'.
:)
declare function sf:compileSfPatternSet($patterns as xs:string*,
$ignoreCase as xs:boolean?,
$patternIsRegex as xs:boolean?,
$addAnchors as xs:boolean?)
as element()* {
let $literals :=
if ($patternIsRegex) then () else $patterns[not(matches(., '[@*?\\]'))]
let $literalsEff :=
if (not($ignoreCase)) then $literals else $literals ! lower-case(.)
(: each regex is described by a map with keys 'expr' and 'flags' :)
let $regexes :=
for $pattern in $patterns[not(. = $literals)]
let $regexAndFlags :=
sf:patternToRegexAndFlags($pattern, $ignoreCase, $patternIsRegex, $addAnchors)
return <regex expr="{$regexAndFlags[1]}" flags="{$regexAndFlags[2]}"/>
return (
<regexes>{$regexes}</regexes>,
if (empty($literals)) then () else
let $key := if ($addAnchors) then 'strings' else 'substrings'
return
element {$key} {
attribute ignoreCase {$ignoreCase},
$literalsEff ! <s>{.}</s>}
)
};
(:~
: Maps a pattern string to a regex string and a flags string.
: The pattern string may contain flags, separted from the
: pattern itself by a '@'.
:
: Supported flags:
: a/A - add anchors / do not add anchors
: r/R - pattern is regex / is not regex
: c/C - evaluate case-sensitively / not case-sensitively
:
: The parameters $ignoreCase, $patternIsRegex, $addAnchors
: provide default values for flags.
:
: The return value consists of the regex and optional regex flags,
: which are either the empty string or 'i' meaning case-insensitive
: evaluation.
:)
declare function sf:patternToRegexAndFlags(
$pattern as xs:string,
$ignoreCase as xs:boolean?,
$patternIsRegex as xs:boolean?,
$addAnchors as xs:boolean?)
as xs:string+ {
let $exprAndFlags :=
if (not(contains($pattern, '@'))) then $pattern (: no flags :)
else
let $exprAndFlags := sf:splitStringAtDoubleEscapableChar($pattern, '@')
let $expr := $exprAndFlags[1]
let $flags := $exprAndFlags[2]
return ($expr, $flags)
let $expr := $exprAndFlags[1]
let $flags := $exprAndFlags[2]
let $regexAndFlags :=
(: no flags :)
if (not($flags)) then
let $regex :=
if ($patternIsRegex) then $expr
else $expr ! sf:globToRegex(., 'A'[not($addAnchors)])
let $regexFlags := 'i'[$ignoreCase]
return ($regex, $regexFlags)
else
let $patternIsRegexEff :=
if ($flags ! matches(., 'r', 'i')) then contains($flags, 'r')
else $patternIsRegex
let $addAnchorsEff :=
if ($flags ! matches(., 'a', 'i')) then not(contains($flags, 'A'))
else $addAnchors
let $ignoreCaseEff :=
if ($flags ! matches(., 'c', 'i')) then not(contains($flags, 'c'))
else $ignoreCase
let $regexFlags := 'i'[$ignoreCaseEff]
let $regexExpr :=
if ($patternIsRegexEff) then $expr
else $expr ! sf:globToRegex(., 'A'[not($addAnchorsEff)])
return ($regexExpr, $regexFlags)
return $regexAndFlags
};
(:
: M a t c h s t r i n g f i l t e r
: =======================================
:)
(:~
: Matches a string against a string filter. The filter has been constructed
: by function f:compileStringFilter.
:
: @param string the string to match
: @param filter the compiled string filter
: @return true or false, if the string matches, does not match, the filter
:)
declare function sf:matchesStringFilter($string as item(),
$filter as element(stringFilter)?)
as xs:boolean {
if (empty($filter)) then true() else
let $include := $filter/include
let $exclude := $filter/exclude
return
(empty($include) or $include/sf:matchesSfPatternSet($string, .)) and
(empty($exclude) or not($exclude/sf:matchesSfPatternSet($string, .)))
};
(:~
: Matches a string against a string filter pattern set. The pattern set
: is represented by an element with the following child elements:
: - regexes: contains a sequence of <regex> elements, each with attributes @expr and @flags
: - strings: contains a sequence of strings, wrapped in <s> elements;
: matching requires that the test string is equal to one of these;
: @ignoreCase=true/false indicates case-sensitivity
: - substrings: contains a sequence of strings, wrapped in <s> elements;
: matching requires that the test string contains one of these;
: @ignoreCase=true/false indicates case-sensitivity
:
: @param string the string to match
: @param stringFilter a compiled string filter
: @return true if the string filter is matched, false otherwise
:)
declare function sf:matchesSfPatternSet($string as xs:string,
$patternSet as element())
as xs:boolean {
let $stringEff := if ($patternSet/(strings, substring)/@ignoreCase = 'true') then lower-case($string) else $string
return
$patternSet/strings/* = $stringEff
or (some $sstr in $patternSet/substrings/* satisfies
contains($stringEff, $sstr))
or (some $r in $patternSet/regexes/regex satisfies
matches($string, $r/@expr, $r/@flags))
};
(:
: U t i l i t y f u n c t i o n s
: ==================================
:)
(:~
: Splits a string into items and flags. The optional flags are separated
: from the items by a # character. Doubled # characters are interpreted as
: literal characters which do not separate items and flags.
:
: The item text is tokenized into items separated by whitespace (default)
: or one of the characters ,;:/. A non-whitespace separator is assumed if
: contained by the flag string.
:
: Example: "foo bar zoo"
: => flags="", three items="foo", "bar", "zoo"
:
: Example: "foo##bar#c"
: => flags="c", one item="foo#bar"
:
: Example: "foo bar, zoo #,c"
: => flags=",c", two items="foo bar", "zoo"
:
: Example: "foo bar; zoo #c;"
: => flags="c;", two items="foo bar", "zoo"
:
: Example: "foo:bar, zoo #:"
: => flags=":", two items="foo", "bar, zoo"
:
: @param string the string to be split
: @return a sequence of strings; the first one represents the flags, which
: may be a zero-length string; all following items represent the items
: extracted from the string
:)
declare function sf:splitStringIntoItemsAndFlags($string as xs:string,
$char as xs:string)
as xs:string+ {
let $concatAndFlags := sf:splitStringAtDoubleEscapableChar($string, $char)
let $concat := $concatAndFlags[1]
let $flags := $concatAndFlags[2]
return
if (tokenize($flags) = ('fulltext', 'ftext', 'ft')) then ($flags, $concat)
else
let $sep :=
if (not(matches($flags, '[,;:/]'))) then () else
replace($flags, '^.*([,;:/]).*', '$1') ! substring(., 1, 1)
return (
$flags,
if ($sep) then tokenize($concat, '\s*'||$sep||'\s*') else tokenize($concat))
};
(:~
: Returns the substrings preceding and following the first occurrence of a
: character ($char) which is not escaped by repeating it. (In other words:
: the first occurrence of $char which is either not repeated or repeated an
: uneven number of times.) If the string does not contain the character or
: any occurrence is repeated an even number of times), the original string
: and a zero-length string are returned.
:
: The first substring returned is edited by replacing any doubled occurrence
: of the character with a single occurrence. (Note that the second
: substring is not edited.)
:
: @param string the string to be analyzed
: @param char the character separating the substrings
: @return sequence of two strings: the string preceding and
: the string following the character
:)
declare function sf:splitStringAtDoubleEscapableChar(
$string as xs:string,
$char as xs:string)
as xs:string+ {
if (not(contains($string, $char))) then ($string, '')
else if (not(contains($string, $char||$char))) then (
substring-before($string, $char), substring-after($string, $char))
else
let $patternBefore := '^('||$char||$char||'|[^'||$char||'])+'
return
let $before := replace($string, '('||$patternBefore||').*', '$1')
let $after := substring($string, string-length($before) + 2)
return ($before ! replace(., $char||$char, $char), $after)
};
(:~
: Maps a glob pattern to a regular expression.
:
: @param glob a glob pattern
: @param flags flags controlling the evaluation;
: @return the equivalent regular expession
:)
declare function sf:globToRegex($glob as xs:string, $flags as xs:string?)
as xs:string {
let $addAnchors := not(contains($flags, 'A')) return
$glob
! replace(., '\\s', ' ')
! replace(., '[.+|\\(){}\[\]\^$]', '\\$0')
! replace(., '\*', '.*')
! replace(., '\?', '.')
! (if ($addAnchors) then concat('^', ., '$') else .)
};