Skip to content

Add CleanCoNLL object #2421

Add CleanCoNLL object

Add CleanCoNLL object #2421

Triggered via pull request on October 14, 2024 at 13:53
Status: Failure
Total duration: 18m 12s
Artifacts

ci.yml

on: pull_request
Fit to window
Zoom out
Zoom in

Annotations

6 errors and 1 warning
test: flair/__init__.py#L1
mypy-status mypy exited with status 1.
test: flair/datasets/__init__.py#L341
ruff pytest_ruff.RuffError: flair/datasets/__init__.py:2:1: I001 [*] Import block is un-sorted or un-formatted | 1 | # Expose base classses 2 | / from .base import ( 3 | | DataLoader, 4 | | FlairDatapointDataset, 5 | | MongoDataset, 6 | | SentenceDataset, 7 | | StringDataset, 8 | | ) 9 | | 10 | | # Expose all biomedical data sets used for the evaluation of BioBERT 11 | | # - 12 | | # - 13 | | # - 14 | | # - 15 | | # Expose all biomedical data sets using the HUNER splits 16 | | # Expose all biomedical data sets 17 | | from .biomedical import ( 18 | | ANAT_EM, 19 | | AZDZ, 20 | | BC2GM, 21 | | BIO_INFER, 22 | | BIOBERT_CHEMICAL_BC4CHEMD, 23 | | BIOBERT_CHEMICAL_BC5CDR, 24 | | BIOBERT_DISEASE_BC5CDR, 25 | | BIOBERT_DISEASE_NCBI, 26 | | BIOBERT_GENE_BC2GM, 27 | | BIOBERT_GENE_JNLPBA, 28 | | BIOBERT_SPECIES_LINNAEUS, 29 | | BIOBERT_SPECIES_S800, 30 | | BIONLP2013_CG, 31 | | BIONLP2013_PC, 32 | | BIOSEMANTICS, 33 | | CDR, 34 | | CELL_FINDER, 35 | | CEMP, 36 | | CHEMDNER, 37 | | CLL, 38 | | CRAFT, 39 | | CRAFT_V4, 40 | | DECA, 41 | | FSU, 42 | | GELLUS, 43 | | GPRO, 44 | | HUNER_CELL_LINE, 45 | | HUNER_CELL_LINE_CELL_FINDER, 46 | | HUNER_CELL_LINE_CLL, 47 | | HUNER_CELL_LINE_GELLUS, 48 | | HUNER_CELL_LINE_JNLPBA, 49 | | HUNER_CHEMICAL, 50 | | HUNER_CHEMICAL_CDR, 51 | | HUNER_CHEMICAL_CEMP, 52 | | HUNER_CHEMICAL_CHEBI, 53 | | HUNER_CHEMICAL_CHEMDNER, 54 | | HUNER_CHEMICAL_CRAFT_V4, 55 | | HUNER_CHEMICAL_SCAI, 56 | | HUNER_DISEASE, 57 | | HUNER_DISEASE_CDR, 58 | | HUNER_DISEASE_MIRNA, 59 | | HUNER_DISEASE_NCBI, 60 | | HUNER_DISEASE_PDR, 61 | | HUNER_DISEASE_SCAI, 62 | | HUNER_DISEASE_VARIOME, 63 | | HUNER_GENE, 64 | | HUNER_GENE_BC2GM, 65 | | HUNER_GENE_BIO_INFER, 66 | | HUNER_GENE_CELL_FINDER, 67 | | HUNER_GENE_CHEBI, 68 | | HUNER_GENE_CRAFT_V4, 69 | | HUNER_GENE_DECA, 70 | | HUNER_GENE_FSU, 71 | | HUNER_GENE_GPRO, 72 | | HUNER_GENE_IEPA, 73 | | HUNER_GENE_JNLPBA, 74 | | HUNER_GENE_LOCTEXT, 75 | | HUNER_GENE_MIRNA, 76 | | HUNER_GENE_OSIRIS, 77 | | HUNER_GENE_VARIOME, 78 | 
| HUNER_SPECIES, 79 | | HUNER_SPECIES_CELL_FINDER, 80 | | HUNER_SPECIES_CHEBI, 81 | | HUNER_SPECIES_CRAFT_V4, 82 | | HUNER_SPECIES_LINNEAUS, 83 | | HUNER_SPECIES_LOCTEXT, 84 | | HUNER_SPECIES_MIRNA, 85 | | HUNER_SPECIES_S800, 86 | | HUNER_SPECIES_VARIOME, 87 | | IEPA, 88 | | JNLPBA, 89 | | LINNEAUS, 90 | | LOCTEXT, 91 | | MIRNA, 92 | | NCBI_DISEASE, 93 | | OSIRIS, 94 | | PDR, 95 | | S800, 96 | | SCAI_CHEMICALS, 97 | | SCAI_DISEASE, 98 | | VARIOME, 99 | | ) 100 | | 101 | | # Expose all document classification datasets 102 | | from .document_classification import ( 103 | | AGNEWS, 104 | | AMAZON_REVIEWS, 105 | | COMMUNICATIVE_FUNCTIONS, 106 | | GERMEVAL_2018_OFFENSIVE_LANGUAGE, 107 | | GLUE_COLA, 108 | | GLUE_SST2, 109 | | GO_EMOTIONS, 110 | | IMDB, 111 | | NEWSGROUPS, 112 | | SENTEVAL_CR, 113 | | SENTEVAL_MPQA, 114 | | SENTEVAL_MR, 115 | | SENTEVAL_SST_BINARY, 116 | | SENTEVAL_SST_GRANULAR, 117 | | SENTEVAL_SUBJ, 118 | | SENTIMENT_140, 119 | | STACKOVERFLOW, 120 | | TREC_6, 121 | | TREC_50, 122 | | WASSA_ANGER, 123 | | WASSA_FEAR, 124 | | WASSA_JOY, 125 | | WASSA_SADNESS, 126 | | YAHOO_ANSWERS, 127 | | ClassificationCorpus, 128 | | ClassificationDataset, 129 | | CSVClassificationCorpus, 130 | | CSVClassificationDataset, 131 | | ) 132 | | 133 | | # word sense disambiguation 134 | | # Expose all entity linking datasets 135 | | from .entity_linking import ( 136 | | CTD_CHEMICALS_DICTIONARY, 137 | | CTD_DISEASES_DICTIONARY, 138 | | NCBI_GENE_HUMAN_DICTIONARY, 139 | | NCBI_TAXONOMY_DICTIO
test: flair/datasets/sequence_labeling.py#L341
ruff pytest_ruff.RuffError: flair/datasets/sequence_labeling.py:1:1: I001 [*] Import block is un-sorted or un-formatted | 1 | / import copy 2 | | import json 3 | | import logging 4 | | import os 5 | | import re 6 | | #import shutil 7 | | from collections import defaultdict 8 | | from pathlib import Path 9 | | import tempfile 10 | | import shutil 11 | | import requests 12 | | import zipfile 13 | | import subprocess 14 | | from typing import ( 15 | | Any, 16 | | DefaultDict, 17 | | Dict, 18 | | Iterable, 19 | | Iterator, 20 | | List, 21 | | Optional, 22 | | Tuple, 23 | | Union, 24 | | cast, 25 | | ) 26 | | 27 | | from torch.utils.data import ConcatDataset, Dataset 28 | | 29 | | import flair 30 | | from flair.data import ( 31 | | Corpus, 32 | | FlairDataset, 33 | | MultiCorpus, 34 | | Relation, 35 | | Sentence, 36 | | Token, 37 | | get_spans_from_bio, 38 | | ) 39 | | from flair.datasets.base import find_train_dev_test_files 40 | | from flair.file_utils import cached_path, unpack_file 41 | | from flair.tokenization import Tokenizer 42 | | 43 | | log = logging.getLogger("flair") | |_^ I001 | = help: Organize imports flair/datasets/sequence_labeling.py:13:8: F401 [*] `subprocess` imported but unused | 11 | import requests 12 | import zipfile 13 | import subprocess | ^^^^^^^^^^ F401 14 | from typing import ( 15 | Any, | = help: Remove unused import: `subprocess` flair/datasets/sequence_labeling.py:1441:9: D212 [*] Multi-line docstring summary should start at the first line | 1439 | **corpusargs, 1440 | ) -> None: 1441 | """ | _________^ 1442 | | Initialize the CleanCoNLL corpus. 1443 | | 1444 | | Args: 1445 | | base_path: Base directory for the dataset. If None, defaults to flair.cache_root / "datasets". 1446 | | in_memory: If True, keeps dataset in memory for faster training. 
1447 | | """ | |___________^ D212 1448 | # Set the base path for the dataset 1449 | base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) | = help: Remove whitespace after opening quotes flair/datasets/sequence_labeling.py:1465:35: Q000 [*] Single quotes found but double quotes preferred | 1464 | # Check if the train data file exists, otherwise download and prepare the dataset 1465 | train_set = data_folder / 'cleanconll.train' | ^^^^^^^^^^^^^^^^^^ Q000 1466 | 1467 | if not train_set.exists(): | = help: Replace single quotes with double quotes flair/datasets/sequence_labeling.py:1490:13: D200 One-line docstring should fit on one line | 1489 | def parse_patch(patch_file_path): 1490 | """ | _____________^ 1491 | | Parses a patch file and returns a structured representation of the changes. 1492 | | """ | |_______________^ D200 1493 | 1494 | changes = [] | = help: Reformat to one line flair/datasets/sequence_labeling.py:1490:13: D202 [*] No blank lines allowed after function docstring (found 1) | 1489 | def parse_patch(patch_file_path): 1490 | """ | _____________^ 1491 | | Parses a patch file and returns a structured representation of the changes. 1492 | | """ | |_______________^ D202 1493 | 1494 | changes = [] | = help: Remove blank line(s) after function docstring flair/datasets/sequence_labeling.py:1490:13: D212 [*] Multi-line docstring summary should start at the first line | 1489 | def parse_patch(patch_file_path): 1490 | """ | _____________^ 1491 | | Parses a patch file and returns a structured representation of the changes. 1492 | | """ | |_______________^ D212 1493 | 1494 | changes = [] | = help: Remove whitespace after opening quotes fl
test: flair/datasets/sequence_labeling.py#L1
Black format check --- /home/runner/work/flair/flair/flair/datasets/sequence_labeling.py 2024-10-14 13:53:15.262883+00:00 +++ /home/runner/work/flair/flair/flair/datasets/sequence_labeling.py 2024-10-14 13:55:42.533954+00:00 @@ -1,11 +1,12 @@ import copy import json import logging import os import re -#import shutil + +# import shutil from collections import defaultdict from pathlib import Path import tempfile import shutil import requests @@ -1428,17 +1429,16 @@ in_memory=in_memory, **corpusargs, ) - class CLEANCONLL(ColumnCorpus): def __init__( - self, - base_path: Optional[Union[str, Path]] = None, - in_memory: bool = True, - **corpusargs, + self, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + **corpusargs, ) -> None: """ Initialize the CleanCoNLL corpus. Args: @@ -1447,24 +1447,20 @@ """ # Set the base path for the dataset base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) # Define column format - columns = {0: "text", - 1: "pos", - 2: "nel", - 3: "ner*", - 4: "ner"} + columns = {0: "text", 1: "pos", 2: "nel", 3: "ner*", 4: "ner"} # Define dataset name dataset_name = self.__class__.__name__.lower() # Define data folder path data_folder = base_path / dataset_name # Check if the train data file exists, otherwise download and prepare the dataset - train_set = data_folder / 'cleanconll.train' + train_set = data_folder / "cleanconll.train" if not train_set.exists(): print("CleanCoNLL files not found, so downloading and creating them.") # Download and prepare the dataset @@ -1492,31 +1488,31 @@ """ changes = [] current_change = None - with open(patch_file_path, 'r') as patch_file: + with open(patch_file_path, "r") as patch_file: for line in patch_file: # Check if the line is a change, delete or add command (like 17721c17703,17705 or 5728d5727) - if line and (line[0].isdigit() and ('c' in line or 'd' in line or 'a' in line)): + if line and (line[0].isdigit() and ("c" in line or "d" in line or "a" in line)): if 
current_change: # Append the previous change block to the changes list changes.append(current_change) # Start a new change block - current_change = {'command': line, 'original': [], 'new': []} + current_change = {"command": line, "original": [], "new": []} # Capture original lines (those marked with "<") - elif line.startswith('<'): + elif line.startswith("<"): if current_change: - current_change['original'].append(line[2:]) # Remove the "< " part + current_change["original"].append(line[2:]) # Remove the "< " part # Capture new lines (those marked with ">") - elif line.startswith('>'): + elif line.startswith(">"): if current_change: - current_change['new'].append(line[2:]) # Remove the "> " part + current_change["new"].append(line[2:]) # Remove the "> " part # Append the last change block to the changes list if current_change: changes.append(current_change) @@ -1524,96 +1520,96 @@ def parse_line_range(line_range_str): """ Utility function to parse a line range string like '17703,17705' or '5727' and returns a tupl
test: flair/datasets/sequence_labeling.py#L1
flair/datasets/sequence_labeling.py:1471: error: Too many arguments for "download_and_prepare_data" of "CLEANCONLL" [call-arg]
test
Process completed with exit code 1.
test
The following actions use a deprecated Node.js version and will be forced to run on node20: actions/checkout@v3, actions/setup-python@v4, actions/cache@v3. For more info: https://github.blog/changelog/2024-03-07-github-actions-all-actions-will-run-on-node20-instead-of-node16-by-default/