Add CleanCoNLL object · flairNLP/flair@53dfd10

Black format check --- /home/runner/work/flair/flair/flair/datasets/sequence_labeling.py 2024-10-14 13:53:15.262883+00:00 +++ /home/runner/work/flair/flair/flair/datasets/sequence_labeling.py 2024-10-14 13:55:42.533954+00:00 @@ -1,11 +1,12 @@ import copy import json import logging import os import re -#import shutil + +# import shutil from collections import defaultdict from pathlib import Path import tempfile import shutil import requests @@ -1428,17 +1429,16 @@ in_memory=in_memory, **corpusargs, ) - class CLEANCONLL(ColumnCorpus): def __init__( - self, - base_path: Optional[Union[str, Path]] = None, - in_memory: bool = True, - **corpusargs, + self, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + **corpusargs, ) -> None: """ Initialize the CleanCoNLL corpus. Args: @@ -1447,24 +1447,20 @@ """ # Set the base path for the dataset base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) # Define column format - columns = {0: "text", - 1: "pos", - 2: "nel", - 3: "ner*", - 4: "ner"} + columns = {0: "text", 1: "pos", 2: "nel", 3: "ner*", 4: "ner"} # Define dataset name dataset_name = self.__class__.__name__.lower() # Define data folder path data_folder = base_path / dataset_name # Check if the train data file exists, otherwise download and prepare the dataset - train_set = data_folder / 'cleanconll.train' + train_set = data_folder / "cleanconll.train" if not train_set.exists(): print("CleanCoNLL files not found, so downloading and creating them.") # Download and prepare the dataset @@ -1492,31 +1488,31 @@ """ changes = [] current_change = None - with open(patch_file_path, 'r') as patch_file: + with open(patch_file_path, "r") as patch_file: for line in patch_file: # Check if the line is a change, delete or add command (like 17721c17703,17705 or 5728d5727) - if line and (line[0].isdigit() and ('c' in line or 'd' in line or 'a' in line)): + if line and (line[0].isdigit() and ("c" in line or "d" in line or "a" in line)): if current_change: # Append the previous change block to the changes list changes.append(current_change) # Start a new change block - current_change = {'command': line, 'original': [], 'new': []} + current_change = {"command": line, "original": [], "new": []} # Capture original lines (those marked with "<") - elif line.startswith('<'): + elif line.startswith("<"): if current_change: - current_change['original'].append(line[2:]) # Remove the "< " part + current_change["original"].append(line[2:]) # Remove the "< " part # Capture new lines (those marked with ">") - elif line.startswith('>'): + elif line.startswith(">"): if current_change: - current_change['new'].append(line[2:]) # Remove the "> " part + current_change["new"].append(line[2:]) # Remove the "> " part # Append the last change block to the changes list if current_change: changes.append(current_change) @@ -1524,96 +1520,96 @@ def parse_line_range(line_range_str): """ Utility function to parse a line range string like '17703,17705' or '5727' and returns a tupl

test: flair/datasets/sequence_labeling.py#L1

flair/datasets/sequence_labeling.py 1471: error: Too many arguments for "download_and_prepare_data" of "CLEANCONLL" [call-arg]

test

Process completed with exit code 1.

test

The following actions use a deprecated Node.js version and will be forced to run on node20: actions/checkout@v3, actions/setup-python@v4, actions/cache@v3. For more info: https://github.blog/changelog/2024-03-07-github-actions-all-actions-will-run-on-node20-instead-of-node16-by-default/

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add CleanCoNLL object #2421

Summary

Add CleanCoNLL object #2421

Jobs

Run details

ci.yml

Annotations