From cd9021ab64480fc4340da9b9992f9f0387115ba0 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Tue, 7 May 2019 15:13:33 +0200 Subject: [PATCH 1/9] add ASCII-only option, to mimic default RE2 behaviour This is a workaround, motivated by the difference in handling non-valid UTF8 bytes that Oniguruma has, compared to Go's default RE2. See https://github.com/src-d/enry/issues/225#issuecomment-490043281 Summary of changes: - c: prevent `NewOnigRegex()` from hard-coding UTF8 - c: `NewOnigRegex()` now properly calls to `onig_initialize()` [1] - go: expose new `MustCompileASCII()` \w default character class matching only ASCII - go: `MustCompile()` refactored, `initRegexp()` extracted for common UTF8/ASCII logic Encoding was not exposed on Go API level intentionally for simplicity, in order to avoid introducing complex struct type [2] to API surface. 1. https://github.com/kkos/oniguruma/blob/83572e983928243d741f61ac290fc057d69fefc3/doc/API#L6 2. https://github.com/kkos/oniguruma/blob/83572e983928243d741f61ac290fc057d69fefc3/src/oniguruma.h#L121 Signed-off-by: Alexander Bezzubov --- chelper.c | 3 ++- regex.go | 27 +++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/chelper.c b/chelper.c index 7b605d1..a7e3ba1 100644 --- a/chelper.c +++ b/chelper.c @@ -17,7 +17,8 @@ int NewOnigRegex( char *pattern, int pattern_length, int option, *error_info = (OnigErrorInfo *) malloc(sizeof(OnigErrorInfo)); memset(*error_info, 0, sizeof(OnigErrorInfo)); - *encoding = (void*)ONIG_ENCODING_UTF8; + OnigEncoding use_encs[] = { *encoding }; + onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); *error_buffer = (char*) malloc(ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char)); diff --git a/regex.go b/regex.go index 9bfc0a0..0c4ee97 100644 --- a/regex.go +++ b/regex.go @@ -48,13 +48,23 @@ type Regexp struct { } func NewRegexp(pattern string, option int) (re *Regexp, err error) { - re = &Regexp{pattern: pattern} - patternCharPtr := C.CString(pattern) 
- defer C.free(unsafe.Pointer(patternCharPtr)) + re = &Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8} + return initRegexp(re, option) +} + +// NewRegexpASCII is equivalent of NewRegexp but matching only ASCII. +func NewRegexpASCII(pattern string, option int) (re *Regexp, err error) { + re = &Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII} + return initRegexp(re, option) +} +func initRegexp(re *Regexp, option int) (*Regexp, error) { + var err error + patternCharPtr := C.CString(re.pattern) + defer C.free(unsafe.Pointer(patternCharPtr)) mutex.Lock() defer mutex.Unlock() - error_code := C.NewOnigRegex(patternCharPtr, C.int(len(pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf) + error_code := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf) if error_code != C.ONIG_NORMAL { err = errors.New(C.GoString(re.errorBuf)) } else { @@ -95,6 +105,15 @@ func MustCompileWithOption(str string, option int) *Regexp { return regexp } +// MustCompileASCII equivalent of MustCompile but with char matching only ASCII. 
+func MustCompileASCII(str string) *Regexp { + regexp, error := NewRegexpASCII(str, ONIG_OPTION_DEFAULT) + if error != nil { + panic("regexp: compiling " + str + ": " + error.Error()) + } + return regexp +} + func (re *Regexp) Free() { mutex.Lock() if re.regex != nil { From 6adc40e14d0915cd2f386d871ede96cc8ff91a7d Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Tue, 7 May 2019 15:37:51 +0200 Subject: [PATCH 2/9] ci: test on 2 latest go versions Signed-off-by: Alexander Bezzubov --- .travis.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8c53b50..31044ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,17 @@ language: go +go: + - '1.11.x' + - '1.12.x' + env: global: - LD_LIBRARY_PATH="/usr/local/lib":${LD_LIBRARY_PATH} - GO111MODULE=on + addons: apt: packages: - libonig-dev -jobs: - include: - - go: 1.11.x - script: - - go test -v --cover -race \ No newline at end of file +script: + - go test -v --cover -race From fcdcc4ec26702726c1ba3dca2902d998fa4e593b Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Tue, 7 May 2019 16:00:33 +0200 Subject: [PATCH 3/9] ci: bump version of Oniguruma to 6.9.1 Update deb to get fix https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627 Signed-off-by: Alexander Bezzubov --- .travis.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 31044ed..ed8b15d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,4 @@ +dist: trusty language: go go: - '1.11.x' @@ -7,11 +8,16 @@ env: global: - LD_LIBRARY_PATH="/usr/local/lib":${LD_LIBRARY_PATH} - GO111MODULE=on - addons: apt: packages: - libonig-dev +before_install: + - sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8, which fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627- sudo dpkg -i libonig-dev_6.9.1-1_amd64.deb + - wget http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_6.9.1-1_amd64.deb + - sudo dpkg -i 
libonig5_6.9.1-1_amd64.deb + - wget http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_6.9.1-1_amd64.deb + - sudo dpkg -i libonig-dev_6.9.1-1_amd64.deb script: - go test -v --cover -race From 37cb5e25da92f9cb596a7096949030a6a7341ddf Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Tue, 7 May 2019 18:59:31 +0200 Subject: [PATCH 4/9] ci: refactor oniguruma installation Signed-off-by: Alexander Bezzubov --- .travis.yml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index ed8b15d..6d783fb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,16 +8,13 @@ env: global: - LD_LIBRARY_PATH="/usr/local/lib":${LD_LIBRARY_PATH} - GO111MODULE=on -addons: - apt: - packages: - - libonig-dev + - ONIGURUMA_VERSION='6.9.1' -before_install: - - sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8, which fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627- sudo dpkg -i libonig-dev_6.9.1-1_amd64.deb - - wget http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_6.9.1-1_amd64.deb - - sudo dpkg -i libonig5_6.9.1-1_amd64.deb - - wget http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_6.9.1-1_amd64.deb - - sudo dpkg -i libonig-dev_6.9.1-1_amd64.deb +before_install: # install oniguruma manualy as trusty has only ancent 5.x + - sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627 + - wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" + - sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" + - wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_{$ONIGURUMA_VERSION}}-1_amd64.deb" + - sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb" script: - go test -v --cover -race From e928d1f0b1ab65f474e6fbd2b6deb666cbd01bb9 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Tue, 7 May 2019 19:04:29 
+0200 Subject: [PATCH 5/9] refactoring go part a bit, addressing review feedback Signed-off-by: Alexander Bezzubov --- regex.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/regex.go b/regex.go index 0c4ee97..05f8949 100644 --- a/regex.go +++ b/regex.go @@ -47,15 +47,14 @@ type Regexp struct { namedGroupInfo NamedGroupInfo } +// NewRegexp creates and initialize new Regexp with a given pattenr and option. func NewRegexp(pattern string, option int) (re *Regexp, err error) { - re = &Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8} - return initRegexp(re, option) + return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option) } -// NewRegexpASCII is equivalent of NewRegexp but matching only ASCII. +// NewRegexpASCII is equivalent to NewRegexp, but with the encoding restricted to ASCII. func NewRegexpASCII(pattern string, option int) (re *Regexp, err error) { - re = &Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII} - return initRegexp(re, option) + return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII}, option) } func initRegexp(re *Regexp, option int) (*Regexp, error) { @@ -64,8 +63,8 @@ func initRegexp(re *Regexp, option int) (*Regexp, error) { defer C.free(unsafe.Pointer(patternCharPtr)) mutex.Lock() defer mutex.Unlock() - error_code := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf) - if error_code != C.ONIG_NORMAL { + errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf) + if errorCode != C.ONIG_NORMAL { err = errors.New(C.GoString(re.errorBuf)) } else { err = nil @@ -105,7 +104,7 @@ func MustCompileWithOption(str string, option int) *Regexp { return regexp } -// MustCompileASCII equivalent of MustCompile but with char matching only ASCII. 
+// MustCompileASCII is equivalent to MustCompile, but with the encoding restricted to ASCII. func MustCompileASCII(str string) *Regexp { regexp, error := NewRegexpASCII(str, ONIG_OPTION_DEFAULT) if error != nil { From 297272b718c5991fe1a1f791bce3678a56c12106 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Wed, 8 May 2019 14:18:32 +0200 Subject: [PATCH 6/9] ci: fix typo in bash var substitution Signed-off-by: Alexander Bezzubov --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6d783fb..0280393 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ before_install: # install oniguruma manualy as trusty has only ancent 5.x - sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627 - wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" - sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" - - wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_{$ONIGURUMA_VERSION}}-1_amd64.deb" + - wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb" - sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb" script: - go test -v --cover -race From 6e6a74f03c522883002d4a67395b18e497e1f120 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Wed, 8 May 2019 14:19:07 +0200 Subject: [PATCH 7/9] cgo: simplify naive encoding init Signed-off-by: Alexander Bezzubov --- chelper.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/chelper.c b/chelper.c index a7e3ba1..d768a77 100644 --- a/chelper.c +++ b/chelper.c @@ -17,8 +17,7 @@ int NewOnigRegex( char *pattern, int pattern_length, int option, *error_info = (OnigErrorInfo *) malloc(sizeof(OnigErrorInfo)); memset(*error_info, 0, sizeof(OnigErrorInfo)); - OnigEncoding use_encs[] = { *encoding }; - onig_initialize(use_encs, 
sizeof(use_encs)/sizeof(use_encs[0])); + onig_initialize_encoding(*encoding); *error_buffer = (char*) malloc(ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char)); From af90abedd4d010fdec0f60c18175f0b433d1980e Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Wed, 8 May 2019 14:19:31 +0200 Subject: [PATCH 8/9] go: doc syntax fix Signed-off-by: Alexander Bezzubov --- regex.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex.go b/regex.go index 05f8949..cbb647c 100644 --- a/regex.go +++ b/regex.go @@ -47,7 +47,7 @@ type Regexp struct { namedGroupInfo NamedGroupInfo } -// NewRegexp creates and initialize new Regexp with a given pattenr and option. +// NewRegexp creates and initializes a new Regexp with the given pattern and option. func NewRegexp(pattern string, option int) (re *Regexp, err error) { return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option) } From d1a3c51c72720e3ce47a985662ffcad3a580d8a5 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Wed, 8 May 2019 14:35:47 +0200 Subject: [PATCH 9/9] tixing fypos Signed-off-by: Alexander Bezzubov --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0280393..29bddf1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ env: - GO111MODULE=on - ONIGURUMA_VERSION='6.9.1' -before_install: # install oniguruma manualy as trusty has only ancent 5.x +before_install: # install oniguruma manually as trusty has only ancient 5.x - sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627 - wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" - sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"