Add CleanCoNLL object #2420
Annotations
6 errors and 1 warning
Run tests:
flair/__init__.py#L1
mypy-status
mypy exited with status 1.
|
Run tests:
flair/datasets/__init__.py#L341
ruff
pytest_ruff.RuffError: flair/datasets/__init__.py:2:1: I001 [*] Import block is un-sorted or un-formatted
|
1 | # Expose base classses
2 | / from .base import (
3 | | DataLoader,
4 | | FlairDatapointDataset,
5 | | MongoDataset,
6 | | SentenceDataset,
7 | | StringDataset,
8 | | )
9 | |
10 | | # Expose all biomedical data sets used for the evaluation of BioBERT
11 | | # -
12 | | # -
13 | | # -
14 | | # -
15 | | # Expose all biomedical data sets using the HUNER splits
16 | | # Expose all biomedical data sets
17 | | from .biomedical import (
18 | | ANAT_EM,
19 | | AZDZ,
20 | | BC2GM,
21 | | BIO_INFER,
22 | | BIOBERT_CHEMICAL_BC4CHEMD,
23 | | BIOBERT_CHEMICAL_BC5CDR,
24 | | BIOBERT_DISEASE_BC5CDR,
25 | | BIOBERT_DISEASE_NCBI,
26 | | BIOBERT_GENE_BC2GM,
27 | | BIOBERT_GENE_JNLPBA,
28 | | BIOBERT_SPECIES_LINNAEUS,
29 | | BIOBERT_SPECIES_S800,
30 | | BIONLP2013_CG,
31 | | BIONLP2013_PC,
32 | | BIOSEMANTICS,
33 | | CDR,
34 | | CELL_FINDER,
35 | | CEMP,
36 | | CHEMDNER,
37 | | CLL,
38 | | CRAFT,
39 | | CRAFT_V4,
40 | | DECA,
41 | | FSU,
42 | | GELLUS,
43 | | GPRO,
44 | | HUNER_CELL_LINE,
45 | | HUNER_CELL_LINE_CELL_FINDER,
46 | | HUNER_CELL_LINE_CLL,
47 | | HUNER_CELL_LINE_GELLUS,
48 | | HUNER_CELL_LINE_JNLPBA,
49 | | HUNER_CHEMICAL,
50 | | HUNER_CHEMICAL_CDR,
51 | | HUNER_CHEMICAL_CEMP,
52 | | HUNER_CHEMICAL_CHEBI,
53 | | HUNER_CHEMICAL_CHEMDNER,
54 | | HUNER_CHEMICAL_CRAFT_V4,
55 | | HUNER_CHEMICAL_SCAI,
56 | | HUNER_DISEASE,
57 | | HUNER_DISEASE_CDR,
58 | | HUNER_DISEASE_MIRNA,
59 | | HUNER_DISEASE_NCBI,
60 | | HUNER_DISEASE_PDR,
61 | | HUNER_DISEASE_SCAI,
62 | | HUNER_DISEASE_VARIOME,
63 | | HUNER_GENE,
64 | | HUNER_GENE_BC2GM,
65 | | HUNER_GENE_BIO_INFER,
66 | | HUNER_GENE_CELL_FINDER,
67 | | HUNER_GENE_CHEBI,
68 | | HUNER_GENE_CRAFT_V4,
69 | | HUNER_GENE_DECA,
70 | | HUNER_GENE_FSU,
71 | | HUNER_GENE_GPRO,
72 | | HUNER_GENE_IEPA,
73 | | HUNER_GENE_JNLPBA,
74 | | HUNER_GENE_LOCTEXT,
75 | | HUNER_GENE_MIRNA,
76 | | HUNER_GENE_OSIRIS,
77 | | HUNER_GENE_VARIOME,
78 | | HUNER_SPECIES,
79 | | HUNER_SPECIES_CELL_FINDER,
80 | | HUNER_SPECIES_CHEBI,
81 | | HUNER_SPECIES_CRAFT_V4,
82 | | HUNER_SPECIES_LINNEAUS,
83 | | HUNER_SPECIES_LOCTEXT,
84 | | HUNER_SPECIES_MIRNA,
85 | | HUNER_SPECIES_S800,
86 | | HUNER_SPECIES_VARIOME,
87 | | IEPA,
88 | | JNLPBA,
89 | | LINNEAUS,
90 | | LOCTEXT,
91 | | MIRNA,
92 | | NCBI_DISEASE,
93 | | OSIRIS,
94 | | PDR,
95 | | S800,
96 | | SCAI_CHEMICALS,
97 | | SCAI_DISEASE,
98 | | VARIOME,
99 | | )
100 | |
101 | | # Expose all document classification datasets
102 | | from .document_classification import (
103 | | AGNEWS,
104 | | AMAZON_REVIEWS,
105 | | COMMUNICATIVE_FUNCTIONS,
106 | | GERMEVAL_2018_OFFENSIVE_LANGUAGE,
107 | | GLUE_COLA,
108 | | GLUE_SST2,
109 | | GO_EMOTIONS,
110 | | IMDB,
111 | | NEWSGROUPS,
112 | | SENTEVAL_CR,
113 | | SENTEVAL_MPQA,
114 | | SENTEVAL_MR,
115 | | SENTEVAL_SST_BINARY,
116 | | SENTEVAL_SST_GRANULAR,
117 | | SENTEVAL_SUBJ,
118 | | SENTIMENT_140,
119 | | STACKOVERFLOW,
120 | | TREC_6,
121 | | TREC_50,
122 | | WASSA_ANGER,
123 | | WASSA_FEAR,
124 | | WASSA_JOY,
125 | | WASSA_SADNESS,
126 | | YAHOO_ANSWERS,
127 | | ClassificationCorpus,
128 | | ClassificationDataset,
129 | | CSVClassificationCorpus,
130 | | CSVClassificationDataset,
131 | | )
132 | |
133 | | # word sense disambiguation
134 | | # Expose all entity linking datasets
135 | | from .entity_linking import (
136 | | CTD_CHEMICALS_DICTIONARY,
137 | | CTD_DISEASES_DICTIONARY,
138 | | NCBI_GENE_HUMAN_DICTIONARY,
138 |  |     NCBI_TAXONOMY_DICTIONARY,
|
Run tests:
flair/datasets/sequence_labeling.py#L341
ruff
pytest_ruff.RuffError: flair/datasets/sequence_labeling.py:1:1: I001 [*] Import block is un-sorted or un-formatted
|
1 | / import copy
2 | | import json
3 | | import logging
4 | | import os
5 | | import re
6 | | #import shutil
7 | | from collections import defaultdict
8 | | from pathlib import Path
9 | | import tempfile
10 | | import shutil
11 | | import requests
12 | | import zipfile
13 | | import subprocess
14 | | from typing import (
15 | | Any,
16 | | DefaultDict,
17 | | Dict,
18 | | Iterable,
19 | | Iterator,
20 | | List,
21 | | Optional,
22 | | Tuple,
23 | | Union,
24 | | cast,
25 | | )
26 | |
27 | | from torch.utils.data import ConcatDataset, Dataset
28 | |
29 | | import flair
30 | | from flair.data import (
31 | | Corpus,
32 | | FlairDataset,
33 | | MultiCorpus,
34 | | Relation,
35 | | Sentence,
36 | | Token,
37 | | get_spans_from_bio,
38 | | )
39 | | from flair.datasets.base import find_train_dev_test_files
40 | | from flair.file_utils import cached_path, unpack_file
41 | | from flair.tokenization import Tokenizer
42 | |
43 | | log = logging.getLogger("flair")
| |_^ I001
|
= help: Organize imports
flair/datasets/sequence_labeling.py:13:8: F401 [*] `subprocess` imported but unused
|
11 | import requests
12 | import zipfile
13 | import subprocess
| ^^^^^^^^^^ F401
14 | from typing import (
15 | Any,
|
= help: Remove unused import: `subprocess`
flair/datasets/sequence_labeling.py:1441:9: D212 [*] Multi-line docstring summary should start at the first line
|
1439 | **corpusargs,
1440 | ) -> None:
1441 | """
| _________^
1442 | | Initialize the CleanCoNLL corpus.
1443 | |
1444 | | Args:
1445 | | base_path: Base directory for the dataset. If None, defaults to flair.cache_root / "datasets".
1446 | | in_memory: If True, keeps dataset in memory for faster training.
1447 | | """
| |___________^ D212
1448 | # Set the base path for the dataset
1449 | base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
|
= help: Remove whitespace after opening quotes
flair/datasets/sequence_labeling.py:1465:35: Q000 [*] Single quotes found but double quotes preferred
|
1464 | # Check if the train data file exists, otherwise download and prepare the dataset
1465 | train_set = data_folder / 'cleanconll.train'
| ^^^^^^^^^^^^^^^^^^ Q000
1466 |
1467 | if not train_set.exists():
|
= help: Replace single quotes with double quotes
flair/datasets/sequence_labeling.py:1490:13: D200 One-line docstring should fit on one line
|
1489 | def parse_patch(patch_file_path):
1490 | """
| _____________^
1491 | | Parses a patch file and returns a structured representation of the changes.
1492 | | """
| |_______________^ D200
1493 |
1494 | changes = []
|
= help: Reformat to one line
flair/datasets/sequence_labeling.py:1490:13: D202 [*] No blank lines allowed after function docstring (found 1)
|
1489 | def parse_patch(patch_file_path):
1490 | """
| _____________^
1491 | | Parses a patch file and returns a structured representation of the changes.
1492 | | """
| |_______________^ D202
1493 |
1494 | changes = []
|
= help: Remove blank line(s) after function docstring
flair/datasets/sequence_labeling.py:1490:13: D212 [*] Multi-line docstring summary should start at the first line
|
1489 | def parse_patch(patch_file_path):
1490 | """
| _____________^
1491 | | Parses a patch file and returns a structured representation of the changes.
1492 | | """
| |_______________^ D212
1493 |
1494 | changes = []
|
= help: Remove whitespace after opening quotes
fl
|
Run tests:
flair/datasets/sequence_labeling.py#L1
Black format check
--- /home/runner/work/flair/flair/flair/datasets/sequence_labeling.py 2024-10-14 13:16:36.357280+00:00
+++ /home/runner/work/flair/flair/flair/datasets/sequence_labeling.py 2024-10-14 13:19:09.671453+00:00
@@ -1,11 +1,12 @@
import copy
import json
import logging
import os
import re
-#import shutil
+
+# import shutil
from collections import defaultdict
from pathlib import Path
import tempfile
import shutil
import requests
@@ -1428,17 +1429,16 @@
in_memory=in_memory,
**corpusargs,
)
-
class CLEANCONLL(ColumnCorpus):
def __init__(
- self,
- base_path: Optional[Union[str, Path]] = None,
- in_memory: bool = True,
- **corpusargs,
+ self,
+ base_path: Optional[Union[str, Path]] = None,
+ in_memory: bool = True,
+ **corpusargs,
) -> None:
"""
Initialize the CleanCoNLL corpus.
Args:
@@ -1447,24 +1447,20 @@
"""
# Set the base path for the dataset
base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
# Define column format
- columns = {0: "text",
- 1: "pos",
- 2: "nel",
- 3: "ner*",
- 4: "ner"}
+ columns = {0: "text", 1: "pos", 2: "nel", 3: "ner*", 4: "ner"}
# Define dataset name
dataset_name = self.__class__.__name__.lower()
# Define data folder path
data_folder = base_path / dataset_name
# Check if the train data file exists, otherwise download and prepare the dataset
- train_set = data_folder / 'cleanconll.train'
+ train_set = data_folder / "cleanconll.train"
if not train_set.exists():
print("CleanCoNLL files not found, so downloading and creating them.")
# Download and prepare the dataset
@@ -1492,31 +1488,31 @@
"""
changes = []
current_change = None
- with open(patch_file_path, 'r') as patch_file:
+ with open(patch_file_path, "r") as patch_file:
for line in patch_file:
# Check if the line is a change, delete or add command (like 17721c17703,17705 or 5728d5727)
- if line and (line[0].isdigit() and ('c' in line or 'd' in line or 'a' in line)):
+ if line and (line[0].isdigit() and ("c" in line or "d" in line or "a" in line)):
if current_change:
# Append the previous change block to the changes list
changes.append(current_change)
# Start a new change block
- current_change = {'command': line, 'original': [], 'new': []}
+ current_change = {"command": line, "original": [], "new": []}
# Capture original lines (those marked with "<")
- elif line.startswith('<'):
+ elif line.startswith("<"):
if current_change:
- current_change['original'].append(line[2:]) # Remove the "< " part
+ current_change["original"].append(line[2:]) # Remove the "< " part
# Capture new lines (those marked with ">")
- elif line.startswith('>'):
+ elif line.startswith(">"):
if current_change:
- current_change['new'].append(line[2:]) # Remove the "> " part
+ current_change["new"].append(line[2:]) # Remove the "> " part
# Append the last change block to the changes list
if current_change:
changes.append(current_change)
@@ -1524,96 +1520,96 @@
def parse_line_range(line_range_str):
"""
        Utility function to parse a line range string like '17703,17705' or '5727' and returns a tuple
|
Run tests:
flair/datasets/sequence_labeling.py#L1
flair/datasets/sequence_labeling.py
1471: error: Too many arguments for "download_and_prepare_data" of "CLEANCONLL" [call-arg]
|
Run tests
Process completed with exit code 1.
|
The following actions use a deprecated Node.js version and will be forced to run on node20: actions/checkout@v3, actions/setup-python@v4, actions/cache@v3. For more info: https://github.blog/changelog/2024-03-07-github-actions-all-actions-will-run-on-node20-instead-of-node16-by-default/
|
Loading