diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index db732166..e6e56dad 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -1,26 +1,25 @@ -# This file is autogenerated by maturin v1.1.0 -# To update, run -# -# maturin generate-ci github -# name: CI on: - push: - branches: - - main - - master - tags: - - '*' - pull_request: - workflow_dispatch: + push: + branches: + - main + - master + tags: + - '*' + pull_request: + workflow_dispatch: + permissions: contents: read jobs: - linux: + build-linux: runs-on: ubuntu-latest + env: + CC: gcc-11 + CXX: g++-11 strategy: matrix: target: [x86_64, x86, aarch64, armv7] @@ -29,20 +28,28 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' + - name: Setup environment + run: | + sudo apt-get update + sudo apt-get install --yes --upgrade build-essential cmake protobuf-compiler libssl-dev glibc-source + - name: Install 32bit version of libc + if: ${{ matrix.target == 'x86' }} + run: | + sudo apt-get install --yes --upgrade libc6-dev-i386 - name: Build wheels uses: PyO3/maturin-action@v1 with: target: ${{ matrix.target }} args: --release --out dist --find-interpreter - sccache: 'true' - manylinux: auto + sccache: ${{ matrix.target == 'x86' && 'false' || 'true' }} + manylinux: ${{ matrix.target == 'x86' && 'auto' || '2_28' }} - name: Upload wheels uses: actions/upload-artifact@v3 with: name: wheels path: dist - windows: + build-windows: runs-on: windows-latest strategy: matrix: @@ -58,14 +65,14 @@ jobs: with: target: ${{ matrix.target }} args: --release --out dist --find-interpreter - sccache: 'true' + sccache: true - name: Upload wheels uses: actions/upload-artifact@v3 with: name: wheels path: dist - macos: + build-macos: runs-on: macos-latest strategy: matrix: @@ -80,7 +87,7 @@ jobs: with: target: ${{ matrix.target }} args: --release --out dist --find-interpreter - sccache: 'true' + sccache: true - name: Upload wheels uses: actions/upload-artifact@v3 with: @@ -106,7 +113,7 @@ 
jobs: name: Release runs-on: ubuntu-latest if: "startsWith(github.ref, 'refs/tags/')" - needs: [linux, windows, macos, sdist] + needs: [build-linux, build-windows, build-macos, sdist] steps: - uses: actions/download-artifact@v3 with: diff --git a/.github/workflows/ISSUE_TEMPLATE/bug_report.yml b/.github/workflows/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 00000000..a49f285f --- /dev/null +++ b/.github/workflows/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,49 @@ +name: 🐛 Bug Report +description: Create a report to help us reproduce and fix the bug +labels: 'type/bug' + +body: +- type: markdown + attributes: + value: > + #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/allenai/dolma/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: 🐛 Describe the bug + description: | + Please provide a clear and concise description of what the bug is. + + If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: + + ```python + # All necessary imports at the beginning + import dolma + + # A succinct reproducing example trimmed down to the essential parts: + assert False is True, "Oh no!" + ``` + + If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. 
It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. + + If the bug is related to a specific project, such as the model or data, please add the corresponding label (e.g. "project/model"). + placeholder: | + A clear and concise description of what the bug is. + validations: + required: true +- type: textarea + attributes: + label: Versions + description: | + Please run the following and paste the output below. + ```sh + python --version && pip freeze + ``` + Also, please include your `Cargo.lock` file. + validations: + required: true +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/workflows/ISSUE_TEMPLATE/documentation.yml b/.github/workflows/ISSUE_TEMPLATE/documentation.yml new file mode 100644 index 00000000..676332bd --- /dev/null +++ b/.github/workflows/ISSUE_TEMPLATE/documentation.yml @@ -0,0 +1,23 @@ +name: 📚 Documentation +description: Report an issue related to documentation +labels: 'type/documentation' + +body: +- type: textarea + attributes: + label: 📚 The doc issue + description: > + A clear and concise description of what content in the docs is an issue. + + If this is related to a specific project, such as the model or data, please add the corresponding label (e.g. "project/model"). + validations: + required: true +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! 
diff --git a/.github/workflows/ISSUE_TEMPLATE/feature_request.yml b/.github/workflows/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 00000000..15d06977 --- /dev/null +++ b/.github/workflows/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,28 @@ +name: 🚀 Feature request +description: Submit a proposal/request for a new feature +labels: 'type/feature' + +body: +- type: textarea + attributes: + label: 🚀 The feature, motivation and pitch + description: > + A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. + + If your request is related to a specific project, such as the model or data, please add the corresponding label (e.g. "project/model"). + validations: + required: true +- type: textarea + attributes: + label: Alternatives + description: > + A description of any alternative solutions or features you've considered, if any. +- type: textarea + attributes: + label: Additional context + description: > + Add any other context or screenshots about the feature request. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/workflows/ISSUE_TEMPLATE/question.yml b/.github/workflows/ISSUE_TEMPLATE/question.yml new file mode 100644 index 00000000..6acfebc0 --- /dev/null +++ b/.github/workflows/ISSUE_TEMPLATE/question.yml @@ -0,0 +1,14 @@ +name: ❓ Question +description: Ask or propose a question +labels: 'type/question' + +body: +- type: textarea + attributes: + label: ❓ The question + description: > + A clear and concise description of the question being asked. + + If your question is related to a specific project, such as the model or data, please add the corresponding label (e.g. "project/model"). 
+ validations: + required: true diff --git a/Cargo.lock b/Cargo.lock index c8c6da49..d380d608 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -682,7 +682,7 @@ dependencies = [ [[package]] name = "dolma" -version = "0.6.0" +version = "0.6.1" dependencies = [ "ahash", "aws-config", diff --git a/Cargo.toml b/Cargo.toml index 69c0289c..25d8650c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dolma" -version = "0.6.1" +version = "0.6.2" edition = "2021" license = "Apache-2.0" diff --git a/Makefile b/Makefile index 8cf76d37..ada3fe2f 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ setup: $(shell "${CMAKE_SETUP}") $(shell "${PROTOBUF_SETUP}") $(shell "${OPENSSL_SETUP}") - which cargo || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh + which cargo || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y which maturin || pip install maturin publish: @@ -53,5 +53,9 @@ develop: style: rustfmt --edition 2021 src/*.rs - autopep8 --in-place --recursive python/ && isort python/ && black python/ - autopep8 --in-place --recursive tests/python/ && isort tests/python/ && black tests/python/ + autopep8 --in-place --recursive python/ + isort python/ + black python/ + autopep8 --in-place --recursive tests/python/ + isort tests/python/ + black tests/python/ diff --git a/README.md b/README.md index 4961f4cd..321c2816 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ conda activate dolma make setup ``` -Finally, to begin development, install the repository in editable mode using maturin. +and restart your shell. Finally, to begin development, install the repository in editable mode using maturin. 
```shell make develop diff --git a/pyproject.toml b/pyproject.toml index 36c10359..e1149f6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "0.6.1" +version = "0.6.2" description = "Data filters" license = {text = "Apache-2.0"} readme = "README.md" diff --git a/tests/python/test_taggers.py b/tests/python/test_taggers.py index 11ae82c6..468506c8 100644 --- a/tests/python/test_taggers.py +++ b/tests/python/test_taggers.py @@ -218,35 +218,35 @@ def test_predict_multiline(self): }, ) self.assertEqual( - d["spans"][9], {"start": 0, "end": 15, "type": "character_count", "score": 79.0, "mention": text} + d["spans"][9], {"start": 0, "end": 79, "type": "character_count", "score": 79.0, "mention": text} ) self.assertEqual( - d["spans"][10], {"start": 0, "end": 15, "type": "word_count", "score": 13.0, "mention": text} + d["spans"][10], {"start": 0, "end": 79, "type": "word_count", "score": 13.0, "mention": text} ) self.assertEqual( - d["spans"][11], {"start": 0, "end": 15, "type": "median_word_length", "score": 4.0, "mention": text} + d["spans"][11], {"start": 0, "end": 79, "type": "median_word_length", "score": 4.0, "mention": text} ) self.assertEqual( - d["spans"][12], {"start": 0, "end": 15, "type": "symbol_to_word_ratio", "score": 0.0, "mention": text} + d["spans"][12], {"start": 0, "end": 79, "type": "symbol_to_word_ratio", "score": 0.0, "mention": text} ) self.assertEqual( d["spans"][13], { "start": 0, - "end": 15, + "end": 79, "type": "fraction_of_words_with_alpha_character", "score": 1.0, "mention": text, }, ) self.assertEqual( - d["spans"][14], {"start": 0, "end": 15, "type": "required_word_count", "score": 0.0, "mention": text} + d["spans"][14], {"start": 0, "end": 79, "type": "required_word_count", "score": 0.0, "mention": text} ) self.assertEqual( d["spans"][15], { "start": 0, - "end": 15, + "end": 79, "type": "fraction_of_lines_starting_with_bullet_point", "score": 0.0, "mention": text, @@ -256,7 +256,7 @@ def 
test_predict_multiline(self): d["spans"][16], { "start": 0, - "end": 15, + "end": 79, "type": "fraction_of_lines_ending_with_ellipsis", "score": 0.0, "mention": text, @@ -264,13 +264,13 @@ def test_predict_multiline(self): ) self.assertEqual( d["spans"][17], - {"start": 0, "end": 15, "type": "fraction_of_duplicate_lines", "score": 0.0, "mention": text}, + {"start": 0, "end": 79, "type": "fraction_of_duplicate_lines", "score": 0.0, "mention": text}, ) self.assertEqual( d["spans"][18], { "start": 0, - "end": 15, + "end": 79, "type": "fraction_of_characters_in_duplicate_lines", "score": 0.0, "mention": text,