-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathzawk
executable file
·322 lines (290 loc) · 11.5 KB
/
zawk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
#!/bin/bash
# This uses bash for arrays, pattern substitution, and $'...' ANSI C quoting
# Usage: help
# Print the usage text, then call version() (which prints the version banner
# and exits). The here-document delimiter is unquoted, so ${0##*/} expands to
# this script's base name and \` produces a literal backtick.
help() { cat <</help
Usage: ${0##*/} [OPTIONS] [--] 'PROGRAM TEXT' [ASSIGNMENT...] [FILE...]
${0##*/} [OPTIONS] -f PROGRAM_FILE [-f...] [--] [ASSIGNMENT...] [FILE...]
Decompress input(s) and pass to awk with proper FILENAME, FNR, and NR variables.
This works IN-LINE so you can stream things properly.
--awk=COMMAND Use COMMAND instead of \`awk\` (not in mawk)
-F REGEX Set the field separator, FS, to REGEX (like \`-v FS=REGEX\`)
-f FILE Program text is read from FILE instead of command line
-v ASSIGNMENT Creates variable VAR and assigns VALUE to it
-- This is the final option
FILE A file to process (default is \`-\`, which is \`/dev/stdin\`)
ASSIGNMENT As -v but potentially after running FILE(s)
WARNING: Delayed assignment is not implemented yet
Extra awk variables:
UNZIPPER The command used to decompress the current FILENAME
BUG1: If a piped input is EXACTLY 15 bytes, this will awkwardly wait for more.
BUG2: ASSIGNMENTs cannot be made between reading FILEs. They're all made first.
BUG3: GNU/gawk's \`nextfile\` and other file-specific extensions won't work.
/help
version
}
# Usage: version
# Print three lines and exit:
#   1. the first digit-bearing line of `$AWK --version`, or a note that the
#      awk in use does not report a version
#   2. the project URL
#   3. zawk's own version and license line
# Reads the global AWK for the command to query.
version() {
  echo "Wraps awk ($("$AWK" --version 2>/dev/null |grep -m1 '[0-9]' \
    || echo "'$AWK' does not appear to report its own version"))"
  # The host needs its top-level domain to be a usable link
  echo "Part of misc-scripts: https://github.com/adamhotep/misc-scripts"
  echo "zawk 0.4.20240807.1 Copyright 2010+ by Adam Katz, Apache License 2.0"
  exit
}
# Usage: warn MESSAGE
# Write "<script base name>: MESSAGE" to standard error.
# MESSAGE is all arguments joined; it defaults to WARNING when none are given.
warn() {
  local prog="${0##*/}" text="${*:-WARNING}"
  printf '%s\n' "${prog}: ${text}" >&2
}
# Usage: die MESSAGE
# Hand MESSAGE (default: ABORTING) to warn, then exit the current shell with
# status 2
die() {
  local reason="${*:-ABORTING}"
  warn "$reason"
  exit 2
}
# Usage: mktemp [ARGS...]
# Wrapper around the real mktemp: try it with the given arguments first and,
# if that fails, retry with an explicit template appended (some versions of
# mktemp require the template). Errors from both attempts are silenced.
mktemp() {
  if ! command mktemp "$@" 2>/dev/null; then
    command mktemp "$@" "${TMPDIR:-/tmp}/tmp.XXXXXXXXXX" 2>/dev/null
  fi
}
# Usage: we_have COMMAND[...]
# True only when EVERY given command is available. No output.
# Each name is checked on its own: handing several names to one `command -v`
# call in bash reports success as soon as any single one of them is found.
# With no arguments this returns success.
we_have() {
  local cmd
  for cmd in "$@"; do
    command -v "$cmd" >/dev/null 2>&1 || return 1
  done
  return 0
}
# Usage: find_unzipper FILE
# Print the name of the utility that can decompress FILE (default input is
# /dev/stdin), judged from the description `file -` gives of its content.
# Falls back to `cat` when no arm matches. Calls die when the chosen utility
# is not installed.
find_unzipper() {
  local tool=cat description
  description="$(file - < "${1:-/dev/stdin}")"
  # Arm order is significant: the first matching description wins
  case "$description" in
    *": bzip2 "*)      tool=bunzip2 ;;
    *": compress'd"*)  tool=uncompress ;;
    *": gzip "*)       tool=gunzip ;;
    *": LZ4 "*)        tool=unlz4 ;;
    *": LZMA "*)       tool=unlzma ;;
    *": XZ "*)         tool=unxz ;;
    *": zlib "*)       tool=gunzip ;;
    *": Zip "*)        tool=gunzip ;;  # what about multi-file zips?
    *": Zstandard "*)  tool=unzstd ;;
    *": data")         tool=gunzip ;;  # zip is messy, assume it here
  esac
  we_have "$tool" || die "could not find '$tool' to decompress '$1'"
  echo "$tool"
}
# readlink is not POSIX but is present nearly everywhere (GNU coreutils, BSD).
# Define a shell fallback only when the real command is missing:
if ! we_have readlink; then
  # Usage: readlink [-f] LINK
  # Print the target of LINK as an absolute path. With -f or --canonicalize,
  # keep following links, giving up after 100 hops. A path that is not a
  # symlink is printed as-is (made absolute).
  readlink() {
    local follow_all= path="$1" dest= hops=0
    if [[ "$1" == -f || "$1" == --canonicalize ]]; then
      follow_all=1
      path="$2"
    fi
    [[ $path == /* ]] || path="$PWD/$path"
    while [[ -L "$path" ]] && (( (hops += 1) <= 100 )); do
      # The link target is whatever follows "PATH -> " in the long listing
      dest="$(ls -dl "$path")" || break
      dest="${dest#* $path -> }"
      # Relative targets are relative to the directory holding the link
      [[ "$dest" == /* ]] || dest="${path%/*}/$dest"
      [[ -e "$dest" ]] || break
      path="$dest"
      [[ -n "$follow_all" ]] || break
    done
    echo "$path"
  }
fi
# Usage: shorten FILE
# Print the shortest spelling of FILE found among a handful of candidates
# (relative to $PWD, to its parent and grandparent, via `realpath`, and via the
# bash prompt's \w abbreviation). A candidate is accepted only when it is
# shorter than the current best AND `-ef` confirms it names the same file, so
# FILE itself is printed when nothing better is verified.
shorten() {
  # P is local so the loop below does not clobber a caller's variable
  local PTH="$1" UP="${PWD%/*}" UP2="${PWD%/*/*}" RP= PR= P
  # Prefixes are quoted inside the expansions so that directory names holding
  # glob characters ([, *, ?) are stripped literally rather than as patterns
  UP="../${1#"$UP"/}" UP2="../../${1#"$UP2"/}"
  if we_have realpath; then   # full relative path to present working dir
    RP="$(realpath --relative-to="$PWD" "$1" 2>/dev/null)"
  fi
  # ${var@P} (prompt expansion) needs bash 4.4 or newer
  if [ "${BASH_VERSION#4.[4-9]}" != "${BASH_VERSION#[5-9].}" ]; then # 4.4 - 9.9
    PR="${1%/*}"
    if [ -d "$PR" ]; then
      PR="$(cd "$PR"; PR="\w"; echo "${PR@P}/${1##*/}")"  # bash prompt abbrev
    else
      PR="$1"
    fi
  fi
  for P in "${1#"$PWD"/}" "~/${1#"$HOME"/}" "$UP" "$UP2" "$RP" "$PR"; do
    # Note, -ef (same device & inode) is bash/dash/zsh/ksh but not POSIX
    # Tested with `[[ /dev/stdin -ef "$(readlink -f /dev/stdin)" ]]`
    #  * works on bash+udev  (Bash 5.0.11, Linux 5.3.0)
    #  * FAILS on bash+devfs (Bash 4.4.23, FreeBSD 11.2)
    if [ "${#P}" -lt "${#PTH}" ] && [ "$P" -ef "$PTH" ] 2>/dev/null; then
      PTH="$P"
    fi
  done
  printf '%s\n' "$PTH"
}
# Usage: unlink LINK
# Resolve LINK through at most two readlink calls, then print a nicer-looking
# name for the result. Falls back to LINK itself whenever a step yields nothing.
unlink() {
  local once twice pretty
  once="$(readlink "$1")"
  twice="$(readlink "${once:=$1}")"   # first hop, or LINK if that was empty
  pretty="${twice:-$once}"
  case "${pretty:=$1}" in
    /proc/self/*)  pretty="${pretty#/proc/self/}" ;;  # /proc/self/fd/0 -> fd/0
    /dev/fd/*)     pretty="${pretty#/dev/}" ;;        # /dev/fd/0 -> fd/0
    *)             pretty="$(shorten "$pretty")" ;;
  esac
  echo "${pretty:-$1}"
}
# Usage: decompress FILE
# Print one metadata line (consumed by the awk code in $FILE_KEEPER), followed
# by the decompressed content of FILE, on standard output.
#   FILE      path to read; `-` or no argument means /dev/stdin
# Globals:    reads _ZAWK_FS and _ZAWK_D (metadata framing); uses HEAD as the
#             temp file for sniffing pipes, creating it when HEAD is empty
# Returns:    the decompressor's exit status, or find_unzipper's status when no
#             usable decompressor could be found (nothing is printed then)
decompress() {
  local UNZIPPER FILE="${1:--}" FILENAME="${1:--}" RETVAL
  : "decompress $*"
  if [ "$FILE" = "-" ]; then FILE=/dev/stdin; fi
  # character device, named pipe, or symlink to either
  if [ -c "$FILE" ] || [ -p "$FILE" ]; then
    # This used to save the contents in a variable, but we can't save NUL chars
    if [ -z "$HEAD" ]; then HEAD="$(mktemp)"; fi
    head -c15 "$FILE" > "$HEAD" # save first 15 bytes to examine for file type
    # find_unzipper runs in a subshell, so its die cannot stop us; bail out
    # here instead of emitting metadata and running an empty command name
    UNZIPPER="$(find_unzipper "$HEAD")" || return $?
    FILENAME="$(unlink "$FILE")"
    # BUG3 - To implement `nextfile`, background the unzipper and pass its PID
    # on this line (so we can kill it from awk). Problem: we don't know it yet!
    printf "%s%s%s%s\n" "$_ZAWK_FS" "$FILENAME" "$_ZAWK_D" "$UNZIPPER" #metadata
    {
      cat "$HEAD" # put back those first 15 bytes
      if [ "$(wc -c < "$HEAD")" -ge 15 ]; then # only if we read 15 bytes
        : 'BUG1 - we are running `cat` assuming there is a 16th byte to read'
        # This is a minimal risk; a blank gzip is 20B and blank xz is 32B
        cat "$FILE" # read the rest of the pipe
      fi
    } | "$UNZIPPER"
    RETVAL=$?   # status of the decompressor (last stage of the pipeline)
    FILE="$FILENAME" # correct file name for the warning given exit code 1
  else # input is a file or something that we'll let the decompressor error on
    UNZIPPER="$(find_unzipper "$FILE")" || return $?
    if [ "$FILE" = "/dev/stdin" ]; then
      FILENAME="$(unlink "$FILE")"
    fi
    printf "%s%s%s%s\n" "$_ZAWK_FS" "$FILENAME" "$_ZAWK_D" "$UNZIPPER" #metadata
    "$UNZIPPER" < "$FILE"
    RETVAL=$?
  fi
  # some debug information (run me as `bash -x zawk ...`)
  case $RETVAL in
    ( 0 )    : "decompressor exited normally (success)" ;;
    ( 1 )    warn "Warning: error decompressing '$FILE'" ;;
    ( 141 )  : "decompressor prematurely truncated" ;;
    ( * )    : "unknown decompressor error $RETVAL" ;;
  esac
  return $RETVAL
}
# Usage: program_file
# On the first call only: write the FILE_KEEPER metadata handler to a fresh
# temp file, remember that file in PROGRAM_FILE, and append `-f THAT_FILE` to
# OPTS so it is loaded before any user program files. Later calls do nothing.
program_file() {
  if [ -n "$PROGRAM_FILE" ]; then
    return 0
  fi
  PROGRAM_FILE="$(mktemp)"
  echo "$FILE_KEEPER" > "$PROGRAM_FILE"
  OPTS+=( -f "$PROGRAM_FILE" )
}
# Usage: valid_assignment STRING
# Return 0 when STRING looks like an awk assignment (NAME=VALUE), 1 otherwise.
# NAME must be non-empty, start with a letter or underscore, and continue with
# letters, digits, or underscores; VALUE may be anything, including empty.
# This only reports a status: it never exits and sets no global variables.
valid_assignment() {
  # Kept in a variable so the regex is not subject to shell quoting rules
  local re='^[A-Za-z_][A-Za-z_0-9]*='
  [[ "$1" =~ $re ]]
}
# AWK code to handle metadata and control FILENAME, FNR, NR and UNZIPPER:
# If the first 2 chars are _ZAWK_FS && split the rest by _ZAWK_D with > 1 field:
#   If not on the first line ever, save FILENAME and UNZIPPER in ZAWK_DATA
#   Set FILENAME to the first field; set UNZIPPER to the second field
#   Set the file's number of records (FNR) to zero so line 1 increments properly
#   Decrement the overall number of records (NR) because this one is metadata
#   Done with metadata line: `next` moves on to line FNR=1 of this input and
#   restarts rule matching there. This matters for empty inputs, where the
#   following record is another metadata line (or there is no following record
#   at all); the metadata record itself must never reach the user's rules.
# (No, this is not a problem if it precedes a BEGIN clause or function.)
#
# BUG3 - aside from the chicken & egg problem noted in decompress(), this would
# simply be a new function (put on line one):
#     function nextfile() { system("kill " ZAWK_DATA[3]); }
FILE_KEEPER='
substr($0, 1, 2) == _ZAWK_FS && split(substr($0, 3), ZAWK_DATA, _ZAWK_D) > 1 {
  if (NR > 1) { ZAWK_DATA[-1] = FILENAME; ZAWK_DATA[-2] = UNZIPPER; }
  FILENAME = ZAWK_DATA[1];
  UNZIPPER = ZAWK_DATA[2];
  FNR = 0;
  NR--;
  next
}
' # WARNING, THE ABOVE AWK CODE IS COLLAPSED INTO ONE LINE. DO NOT ADD COMMENTS!
# Start from known-empty state so nothing is inherited from the environment
# (an exported OPTS would become an awk argument; an exported HEAD would be
# overwritten by decompress and then deleted by the trap below).
declare -a OPTS=()
_ZAWK_FS=$'\x1c\x01'    # file separator, start-of-heading
_ZAWK_D=$'\x02\x1f'     # delimiter: start-of-text, unit separator
AWK="awk"
PROGRAM_TEXT= PROGRAM_FILE= TMP= HEAD=

# Clean up temp files on exit and on the listed signals. The string is
# single-quoted so the variables are read when the trap fires; each name is
# quoted (paths may contain spaces) and omitted entirely while still empty.
trap 'rm -f -- ${TMP:+"$TMP"} ${HEAD:+"$HEAD"} ${PROGRAM_FILE:+"$PROGRAM_FILE"}' 0 1 2 3 4 5 6 9 11 15 18

# most of these options are from gawk and will cause an error in other awks
# The `-:` entry lets getopts hand us --long options as OPT="-", OPTARG="long"
while getopts bcCd:D:e:E:f:F:ghi:l:L:MnNo:Op:PrsStv:VW:-: OPT; do
  case "$OPT$OPTARG" in
    h|[-W]help* )  help ;;
    [-W]usage )    help |awk '$1 == "Decompress" { exit } 1'; exit ;;
    [-W]vers* )    version ;;
    -awk=?* )      AWK="${OPTARG#*=}" ;;
    -awk* )        die "option --awk=COMMAND lacks a COMMAND" ;;
    - )            break ;;
    f*|-file=?* )  program_file; OPTS+=("-$OPT$OPTARG") ;;
    \? )           die ;; # (error already provided by getopts)
    v* )           valid_assignment "$OPTARG" || die "improper assignment: $OPTARG"
                   OPTS+=("-$OPT$OPTARG") ;;
    F )            OPTS+=("-$OPT" "$OPTARG") ;; # support an empty FS
    * )            OPTS+=("-$OPT$OPTARG") ;;
  esac
done
shift $((OPTIND-1))   # drop the options that getopts consumed

if ! we_have "$AWK"; then
  die "Invalid awk command '$AWK'"
fi
# push _ZAWK_FS and _ZAWK_D variables into awk
OPTS+=(-v _ZAWK_FS="$_ZAWK_FS" -v _ZAWK_D="$_ZAWK_D")

# If no program file (which we'd have already populated), the first operand is
# the program text (default: `1`, i.e. print every record)
if [ -z "$PROGRAM_FILE" ]; then
  n=$'\n' # pure newline; ${A//\n/} fails but ${A//$n/} works
  # Metadata handler is on one line so awk errors have the right line numbers
  PROGRAM_TEXT="${FILE_KEEPER//$n/} ${1:-1}"
  if [ "$#" -gt 0 ]; then shift; fi
fi

# awk also allows an ASSIGNMENT instead of a FILE:
while [ -n "$1" ] && valid_assignment "$1"; do
  : "while valid_assignment $1"
  OPTS+=(-v "$1")
  shift
done

# BUG2: ASSIGNMENTs _between_ FILEs don't work. For example, we can't do this:
#     $ date |awk 'FNR < 2 { print x, y, $0 }' x=1 - y=2 test
#     1  Thu 06 Jun 2019 05:49:09 PM EDT
#     1 2 test file output
# I can add assignments in the file metadata, but awk doesn't have `eval` (or
# anything safer), so the best I'd be able to do is to create a new associative
# array and convert e.g. `x=1` into `ZAWK[x] = 1`
#
# Instead, this seeks assignments, sets them, and prunes them from the FILE list
FILES=()
for ARGUMENT in "$@"; do
  : "for \$ARGUMENT=$ARGUMENT"
  if valid_assignment "$ARGUMENT"; then
    : "BUG2 - An ASSIGNMENT cannot be made between reading FILEs"
    warn "WARNING: '$ARGUMENT' will be assigned BEFORE opening any files"
    OPTS+=(-v "$ARGUMENT")
  else
    FILES+=("$ARGUMENT")
  fi
done
set -- "${FILES[@]}"   # the positional parameters are now FILEs only

# Program text goes after all variables. It is empty when -f supplied the
# program, in which case there is nothing to append.
if [ -n "$PROGRAM_TEXT" ]; then
  OPTS+=("$PROGRAM_TEXT")
fi

TMP="$(mktemp)" || die "could not create a temporary file"
# TMP is solely to track the last byte of each input.
# HEAD (decompress's scratch file for sniffing pipes) is created here, in the
# main shell: the loop below runs in pipeline subshells, so a file created
# there would be re-created for every input and never seen by the exit trap.
HEAD="$(mktemp)" || die "could not create a temporary file"

for FILE in "${@:--}"; do
  : "FILE=$FILE"
  # Add a line break after a previous input that lacked one, so the next
  # metadata line starts its own record. Command substitution strips a trailing
  # newline, hence the line count: 0 means the last byte is not a newline.
  if [ -s "$TMP" ] && [ "$(tail -c1 "$TMP" | wc -l)" -eq 0 ]; then
    printf '\n'
  fi
  decompress "$FILE" |tee "$TMP"
done | "$AWK" "${OPTS[@]}"