flair.datasets.sequence_labeling#

class flair.datasets.sequence_labeling.MultiFileJsonlCorpus(train_files=None, test_files=None, dev_files=None, encoding='utf-8', text_column_name='data', label_column_name='label', metadata_column_name='metadata', label_type='ner', **corpusargs)View on GitHub#

Bases: Corpus

This class represents a generic Jsonl corpus with multiple train, dev, and test files.

class flair.datasets.sequence_labeling.JsonlCorpus(data_folder, train_file=None, test_file=None, dev_file=None, encoding='utf-8', text_column_name='data', label_column_name='label', metadata_column_name='metadata', label_type='ner', autofind_splits=True, name=None, **corpusargs)View on GitHub#

Bases: MultiFileJsonlCorpus

class flair.datasets.sequence_labeling.JsonlDataset(path_to_jsonl_file, encoding='utf-8', text_column_name='data', label_column_name='label', metadata_column_name='metadata', label_type='ner')View on GitHub#

Bases: FlairDataset

is_in_memory()View on GitHub#
Return type:

bool

class flair.datasets.sequence_labeling.MultiFileColumnCorpus(column_format, train_files=None, test_files=None, dev_files=None, column_delimiter='\\s+', comment_symbol=None, encoding='utf-8', document_separator_token=None, skip_first_line=False, in_memory=True, label_name_map=None, banned_sentences=None, default_whitespace_after=1, **corpusargs)View on GitHub#

Bases: Corpus

class flair.datasets.sequence_labeling.ColumnCorpus(data_folder, column_format, train_file=None, test_file=None, dev_file=None, autofind_splits=True, name=None, comment_symbol='# ', **corpusargs)View on GitHub#

Bases: MultiFileColumnCorpus

class flair.datasets.sequence_labeling.ColumnDataset(path_to_column_file, column_name_map, column_delimiter='\\s+', comment_symbol=None, banned_sentences=None, in_memory=True, document_separator_token=None, encoding='utf-8', skip_first_line=False, label_name_map=None, default_whitespace_after=1)View on GitHub#

Bases: FlairDataset

SPACE_AFTER_KEY = 'space-after'#
FEATS = ['feats', 'misc']#
HEAD = ['head', 'head_id']#
is_in_memory()View on GitHub#
Return type:

bool

class flair.datasets.sequence_labeling.ONTONOTES(base_path=None, version='v4', language='english', domain=None, in_memory=True, **corpusargs)View on GitHub#

Bases: MultiFileColumnCorpus

archive_url = 'https://data.mendeley.com/public-files/datasets/zmycy7t9h9/files/b078e1c4-f7a4-4427-be7f-9389967831ef/file_downloaded'#
classmethod get_available_domains(base_path=None, version='v4', language='english', split='train')View on GitHub#
Return type:

List[str]

classmethod dataset_document_iterator(file_path)View on GitHub#

An iterator over CoNLL-formatted files that yields documents, regardless of the number of document annotations in a particular file.

This is useful for CoNLL data that has been preprocessed, for example by the preprocessing performed for the CoNLL-2012 Coreference Resolution shared task.

Return type:

Iterator[List]

classmethod sentence_iterator(file_path)View on GitHub#

An iterator over the sentences in an individual CoNLL-formatted file.

Return type:

Iterator

class flair.datasets.sequence_labeling.CONLL_03(base_path=None, column_format={0: 'text', 1: 'pos', 3: 'ner'}, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.CONLL_03_GERMAN(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.CONLL_03_DUTCH(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.CONLL_03_SPANISH(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.CONLL_2000(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.WNUT_17(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.FEWNERD(setting='supervised', **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.BIOSCOPE(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ARABIC_ANER(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ARABIC_AQMAR(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_BASQUE(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_CHINESE_WEIBO(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_DANISH_DANE(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ENGLISH_MOVIE_SIMPLE(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ENGLISH_MOVIE_COMPLEX(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ENGLISH_SEC_FILLINGS(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ENGLISH_RESTAURANT(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ENGLISH_STACKOVERFLOW(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ENGLISH_TWITTER(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ENGLISH_PERSON(base_path=None, in_memory=True)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ENGLISH_WEBPAGES(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ENGLISH_WNUT_2020(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ENGLISH_WIKIGOLD(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_FINNISH(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_GERMAN_BIOFID(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_GERMAN_EUROPARL(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

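class flair.datasets.sequence_labeling.NER_GERMAN_LEGAL(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus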

class flair.datasets.sequence_labeling.NER_GERMAN_GERMEVAL(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_GERMAN_POLITICS(base_path=None, column_delimiter='\\s+', in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_HUNGARIAN(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ICELANDIC(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_JAPANESE(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_MASAKHANE(languages='luo', version='v2', base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: MultiCorpus

class flair.datasets.sequence_labeling.NER_MULTI_CONER(task='multi', base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: MultiFileColumnCorpus

class flair.datasets.sequence_labeling.NER_MULTI_CONER_V2(task='multi', base_path=None, in_memory=True, use_dev_as_test=True, **corpusargs)View on GitHub#

Bases: MultiFileColumnCorpus

class flair.datasets.sequence_labeling.NER_MULTI_WIKIANN(languages='en', base_path=None, in_memory=False, **corpusargs)View on GitHub#

Bases: MultiCorpus

class flair.datasets.sequence_labeling.NER_MULTI_XTREME(languages='en', base_path=None, in_memory=False, **corpusargs)View on GitHub#

Bases: MultiCorpus

class flair.datasets.sequence_labeling.NER_MULTI_WIKINER(languages='en', base_path=None, in_memory=False, **corpusargs)View on GitHub#

Bases: MultiCorpus

class flair.datasets.sequence_labeling.NER_SWEDISH(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_TURKU(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_UKRAINIAN(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.KEYPHRASE_SEMEVAL2017(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.KEYPHRASE_INSPEC(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.KEYPHRASE_SEMEVAL2010(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.UP_CHINESE(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.UP_ENGLISH(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.UP_FRENCH(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.UP_FINNISH(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.UP_GERMAN(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.UP_ITALIAN(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.UP_SPANISH(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.UP_SPANISH_ANCORA(base_path=None, in_memory=True, document_as_sequence=False, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_HIPE_2022(dataset_name, language, base_path=None, in_memory=True, version='v2.1', branch_name='main', dev_split_name='dev', add_document_separator=False, sample_missing_splits=False, preproc_fn=None, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ICDAR_EUROPEANA(language, base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_NERMUD(domains='all', base_path=None, in_memory=False, **corpusargs)View on GitHub#

Bases: MultiCorpus

class flair.datasets.sequence_labeling.NER_GERMAN_MOBIE(base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

class flair.datasets.sequence_labeling.NER_ESTONIAN_NOISY(version=0, base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: ColumnCorpus

data_url = 'https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/patnlp/estner.cnll.zip'#
label_url = 'https://raw.githubusercontent.com/uds-lsv/NoisyNER/master/data/only_labels'#
class flair.datasets.sequence_labeling.MASAKHA_POS(languages='bam', version='v1', base_path=None, in_memory=True, **corpusargs)View on GitHub#

Bases: MultiCorpus