Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modernize compilation of Sudachi dictionaries #44

Merged
merged 10 commits into from
Aug 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 0 additions & 36 deletions .github/workflows/release.yml

This file was deleted.

36 changes: 0 additions & 36 deletions .github/workflows/release_python.yml

This file was deleted.

19 changes: 19 additions & 0 deletions .github/workflows/test_dic_compilation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# CI check: verifies that the dictionaries still compile whenever a pull
# request is opened, receives new commits, or is reopened.
on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened

jobs:
  build_dic:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      # JDK 11 (Amazon Corretto), with the Gradle dependency cache enabled.
      - name: Setup JDK 11
        uses: actions/setup-java@v3
        with:
          java-version: 11
          distribution: corretto
          cache: gradle

      # Verbose flags are kept on so a failing compilation is diagnosable
      # directly from the CI log.
      - name: Check dictionary compilation
        run: ./gradlew --stacktrace --info build
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ local.properties
# JDT-specific (Eclipse Java Development Tools)
.classpath

# IntelliJ

.idea/

### Java ###
# Compiled class file
*.class
Expand Down Expand Up @@ -177,4 +181,7 @@ gradle-app.setting
# Cache of project
.gradletasknamecache

src/main/text/*_lex.csv
src/main/text/*_lex.csv

# Python Virtual Environments for development
.venv*/
36 changes: 36 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Release process

The release process is semi-manual because you first need to obtain new versions of the raw Sudachi dictionaries.

## Get access to sudachi artifact storages

1. Sudachi AWS account (with MFA enabled)
2. Token for PyPI publication (a long string that starts with `pypi-`)

## Setup release virtual environment

All folders with names starting with `.venv` are ignored in git:

```bash
python3 -m venv .venv
source .venv/bin/activate
```

Install dependencies for build environment

```bash
pip install -r scripts/requirements.txt
```

## Use basic release script

```bash
bash do_release.sh DIC_VERSION /path/to/csv/dics aws-profile arn:aws:iam::0123456789:mfa/iam_user
```

Arguments (positional):
1. Version for the new release (the dictionaries will be uploaded under this version)
2. Path to the csv dictionaries; it should contain the small_lex.zip, core_lex.zip and notcore_lex.zip files
3. Configured AWS profile for Sudachi
4. MFA ARN for the user
5. (optional) Version string for the Python package; defaults to the release version
111 changes: 84 additions & 27 deletions build.gradle
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
plugins {
id 'java-library'
id "de.undercouch.download" version "5.4.0"
}

apply from: 'gradle/version.gradle'
Expand All @@ -9,48 +10,104 @@ repositories {
}

dependencies {
implementation 'com.worksap.nlp:sudachi:0.5.1'
implementation 'com.worksap.nlp:sudachi:0.7.2'
testImplementation 'junit:junit:4.13.1', 'org.hamcrest:hamcrest:2.1'
}

def dictionarySrcDir = 'src/main/text'
def dictionarySrc = [ small: ['small'], core: ['small', 'core'], full: ['small', 'core', 'notcore']]
def dictionarySrc = [ small: ['small'], core: ['small', 'core'], 'full': ['small', 'core', 'notcore']]

task unzipMatrixDef(type: Copy) {
def zipFile = file("${dictionarySrcDir}/matrix.def.zip")
def outputdir = file('build')
def downloadCacheDir = new File(project.buildDir, "cache")

def matrixDefDownload = tasks.register("downloadMatrixDef", Download) {
src "https://d2ej7fkh96fzlu.cloudfront.net/sudachidict-raw/matrix.def.zip"
dest new File(downloadCacheDir, "matrix.def.zip")
overwrite false
}

def unzipMatrixDef = tasks.register('unzipMatrixDef', Copy) {
def zipFile = matrixDefDownload.get().outputs.files.singleFile
def outputDir = new File(project.buildDir, "dict/raw/matrix")
from zipTree(zipFile)
into outputdir
into outputDir
dependsOn(matrixDefDownload)
inputs.file(zipFile)
}

dictionarySrc.each {
def name = it.key
def taskName = "build${name.capitalize()}Dict"
def sources = it.value.collect { "${dictionarySrcDir}/${it}_lex.csv" }
task "${taskName}"(type: JavaExec) {
main = "com.worksap.nlp.sudachi.dictionary.DictionaryBuilder"
classpath = sourceSets.main.runtimeClasspath
args('-o', "build/system_${name}.dic", '-m', 'build/matrix.def', '-d', "${version}", *sources)
maxHeapSize = "4g"
dependsOn unzipMatrixDef
def rawDictSrcs = dictionarySrc.collectMany { it.value }.toSet()

rawDictSrcs.forEach { name ->
final String capitalName = name.capitalize()
final String version = project.property("dict.version").toString()

def downloadTask = tasks.register("download${capitalName}Zip", Download) {
def filename = "${name}_lex.zip"
src "https://d2ej7fkh96fzlu.cloudfront.net/sudachidict-raw/${version}/$filename"
dest new File(downloadCacheDir, "${version}/$name/$filename")
overwrite false
inputs.property("version", version)
}

tasks.register("unzip${capitalName}Dict", Copy) {
def zipFile = downloadTask.get().outputs.files.singleFile
def outputDir = new File(project.buildDir, "dict/raw/$version/$name")
from zipTree(zipFile)
into outputDir
dependsOn(downloadTask)
}
test.dependsOn taskName
}

dictionarySrc.each {
def builtDictDir = new File(project.buildDir, "dict/bin/${property("dict.version")}")

dictionarySrc.entrySet().forEach { e ->
def capitalName = e.key.capitalize()
def version = project.property("dict.version").toString()
def name = e.key

def compileTask = tasks.register("compile${capitalName}Dict", JavaExec) { t ->
def sources = e.value.collect { source ->
def srcName = source.toString().capitalize()
t.dependsOn(tasks.named("unzip${srcName}Dict"))
def dictCsv = new File(project.buildDir, "dict/raw/$version/$source/${source}_lex.csv")
t.inputs.file(dictCsv)
dictCsv
}

def outputFile = new File(builtDictDir, "system_${name}.dic")
def matrixFile = new File(unzipMatrixDef.get().outputs.files.singleFile, "matrix.def")

t.mainClass.set("com.worksap.nlp.sudachi.dictionary.DictionaryBuilder")
t.classpath = sourceSets.main.runtimeClasspath
t.args('-o', outputFile.toString(), '-m', matrixFile, '-d', "${version}", *sources)
t.maxHeapSize = "4g"
t.dependsOn unzipMatrixDef
t.systemProperty('file.encoding', 'UTF-8')
t.inputs.file(matrixFile)
t.outputs.file(outputFile)
}
test.dependsOn(compileTask)
}

dictionarySrc.entrySet().forEach {
def name = it.key
def taskName = "zip${name.capitalize()}Dict"
task "${taskName}"(type: Zip) {
def task = tasks.register("zip${name.capitalize()}Dict", Zip) {
def compileTask = tasks.getByName("compile${name.capitalize()}Dict")
def compiledDict = compileTask.outputs.files.singleFile

archiveBaseName = 'sudachi-dictionary'
archiveVersion = version
archiveVersion.set(version)
archiveClassifier = name
from "build/system_${name}.dic", 'LEGAL', 'LICENSE-2.0.txt'
from compiledDict, 'LEGAL', 'LICENSE-2.0.txt'
into "sudachi-dictionary-${version}"
dependsOn "build${name.capitalize()}Dict"
dependsOn(compileTask)
}
build.dependsOn taskName
tasks.named('build').configure { dependsOn(task) }
}

tasks.withType(Test) {
systemProperty('buildDirectory', 'build')
tasks.withType(JavaCompile).configureEach {
options.encoding = 'UTF-8'
}

tasks.withType(Test).configureEach {
systemProperty('buildDirectory', builtDictDir.toString())
systemProperty('file.encoding', 'UTF-8')
}
37 changes: 37 additions & 0 deletions do_release.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env bash

# This is a script for doing a SIMPLE release.
#
# How to run:
#   do_release.sh DICT_VERSION /path/to/directory/with/raw/dictionaries aws_profile aws_mfa_uid [PY_PACKAGE_VERSION]
#
# Positional arguments:
#   1. DICT_VERSION        version of the dictionaries being released
#   2. RAW_DIC_PATH        directory with the raw (csv) dictionaries
#   3. AWS_PROFILE         AWS profile passed to the upload scripts
#   4. AWS_MFA_ID          MFA identifier passed to the upload scripts
#   5. PY_PACKAGE_VERSION  (optional) Python package version;
#                          defaults to DICT_VERSION when missing or empty
#
# You also need to have prepared python virtual environment with dependencies
# See RELEASE.md for details

# Stop at the first failing step: a failed upload or build must not be
# followed by the remaining release steps. Also treat unset variables as
# errors and propagate failures through pipelines.
set -euo pipefail

if [ "$#" -lt 4 ]; then
    echo "Usage: $0 DICT_VERSION RAW_DIC_PATH AWS_PROFILE AWS_MFA_ID [PY_PACKAGE_VERSION]" >&2
    exit 1
fi

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

DICT_VERSION=$1
RAW_DIC_PATH=$2
AWS_PROFILE=$3
AWS_MFA_ID=$4
# ${5:-...} is safe under `set -u` and keeps the original fallback:
# a missing or empty fifth argument means "use the dictionary version".
PY_PACKAGE_VERSION=${5:-$DICT_VERSION}

# upload dictionary csvs to s3
python3 "$SCRIPT_DIR/scripts/01_upload_raw_dictionaries.py" \
    --input="$RAW_DIC_PATH" \
    --version="$DICT_VERSION" \
    --aws_profile="$AWS_PROFILE" \
    --aws_mfa="$AWS_MFA_ID"

# build binary dictionaries
"$SCRIPT_DIR/gradlew" -Pdict.release=true -Pdict.version="$DICT_VERSION" build

# upload binary dictionaries to s3
python3 "$SCRIPT_DIR/scripts/02_upload_compiled_dictionaries.py" \
    --input="$SCRIPT_DIR/build/distributions" \
    --aws_profile="$AWS_PROFILE" \
    --aws_mfa="$AWS_MFA_ID"

# build python distributions
# (path is quoted so a checkout location containing spaces still works)
bash "$SCRIPT_DIR/package_python.sh" "$DICT_VERSION" "$PY_PACKAGE_VERSION"
2 changes: 2 additions & 0 deletions gradle.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
dict.version=20230110
dict.release=false
39 changes: 30 additions & 9 deletions gradle/version.gradle
Original file line number Diff line number Diff line change
@@ -1,18 +1,39 @@
buildscript {
repositories.mavenCentral()
dependencies.classpath 'org.ajoberstar:grgit:1.7.2'
repositories {
mavenCentral()
}
dependencies {
classpath 'org.ajoberstar.grgit:grgit-core:5.2.0'
}
}

String commitFromGit() {
def git = org.ajoberstar.grgit.Grgit.open {
dir = file('.')
}
def describedCommit = git.describe().toString().trim().replaceFirst('\\Av', '')
String suffix1 = ""
if (describedCommit.matches(".*-[0-9]+-g[0-9a-f]{7}")) {
suffix1 = "-SNAPSHOT"
}
String suffix2 = ""
if (!git.status().isClean()) {
suffix2 = "+dirty"
}
return describedCommit + suffix1 + suffix2
}

ext {
git = org.ajoberstar.grgit.Grgit.open(file('.'))
describedCommit = git.describe().toString().trim().replaceFirst('\\Av', '')
String versionString() {
if (property("dict.release") == "true") {
return property("dict.version")
} else {
return commitFromGit()
}
}

version = describedCommit +
(describedCommit.matches(".*-[0-9]+-g[0-9a-f]{7}") ? "-SNAPSHOT" : "") +
(git.status().isClean() ? "" : "+dirty")
version = versionString()

task showVersion {
tasks.register("showVersion") {
doLast {
println version
}
Expand Down
Binary file modified gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
3 changes: 2 additions & 1 deletion gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-6.7-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2.1-bin.zip
networkTimeout=10000
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
Loading