entity_embed.data_utils package

Submodules

entity_embed.data_utils.field_config_parser module

class entity_embed.data_utils.field_config_parser.FieldConfigDictParser

Bases: object

classmethod from_dict(field_config_dict, record_list)
classmethod from_json(field_config_json_file_obj, record_list)

entity_embed.data_utils.datasets module

class entity_embed.data_utils.datasets.ClusterDataset(record_dict, cluster_field, record_numericalizer, batch_size, max_cluster_size_in_batch, random_seed)

Bases: Generic[torch.utils.data.dataset.T_co]

class entity_embed.data_utils.datasets.RecordDataset(record_dict, record_numericalizer, batch_size)

Bases: Generic[torch.utils.data.dataset.T_co]

entity_embed.data_utils.numericalizer module

class entity_embed.data_utils.numericalizer.FieldConfig(key: str, field_type: entity_embed.data_utils.numericalizer.FieldType, tokenizer: Callable[[str], List[str]], alphabet: List[str], max_str_len: int, vocab: torchtext.vocab.Vocab, n_channels: int, embed_dropout_p: float, use_attention: bool)

Bases: object

alphabet: List[str]
embed_dropout_p: float
field_type: entity_embed.data_utils.numericalizer.FieldType
property is_multitoken
key: str
max_str_len: int
n_channels: int
tokenizer: Callable[[str], List[str]]
use_attention: bool
vocab: torchtext.vocab.Vocab
class entity_embed.data_utils.numericalizer.FieldType(value)

Bases: enum.Enum

An enumeration.

MULTITOKEN = 'multitoken'
SEMANTIC_MULTITOKEN = 'semantic_multitoken'
SEMANTIC_STRING = 'semantic_string'
STRING = 'string'
class entity_embed.data_utils.numericalizer.MultitokenNumericalizer(field, field_config)

Bases: object

build_tensor(val)
is_multitoken = True
class entity_embed.data_utils.numericalizer.RecordNumericalizer(field_config_dict, field_to_numericalizer)

Bases: object

build_tensor_dict(record)
class entity_embed.data_utils.numericalizer.SemanticMultitokenNumericalizer(field, field_config)

Bases: entity_embed.data_utils.numericalizer.MultitokenNumericalizer

class entity_embed.data_utils.numericalizer.SemanticStringNumericalizer(field, field_config)

Bases: object

build_tensor(val)
is_multitoken = False
class entity_embed.data_utils.numericalizer.StringNumericalizer(field, field_config)

Bases: object

build_tensor(val)
is_multitoken = False
entity_embed.data_utils.numericalizer.default_tokenizer(val)

entity_embed.data_utils.union_find module

class entity_embed.data_utils.union_find.UnionFind

Bases: object

component_dict()
find(obj)
union(*objs)
union_pairs(pair_gen)

entity_embed.data_utils.utils module

entity_embed.data_utils.utils.Enumerator(start=0, initial=())
entity_embed.data_utils.utils.assign_clusters(record_dict, cluster_field, cluster_mapping)
entity_embed.data_utils.utils.cluster_dict_to_id_pairs(cluster_dict, left_id_set=None, right_id_set=None)
entity_embed.data_utils.utils.compute_max_str_len(field_val_gen, is_multitoken, tokenizer)
entity_embed.data_utils.utils.compute_vocab_counter(field_val_gen, tokenizer)
entity_embed.data_utils.utils.count_cluster_dict_pairs(cluster_dict)
entity_embed.data_utils.utils.id_pairs_to_cluster_mapping_and_dict(id_pairs, record_dict)
entity_embed.data_utils.utils.record_dict_to_cluster_dict(record_dict, cluster_field)
entity_embed.data_utils.utils.record_dict_to_left_right_id_set(record_dict, source_field, left_source)
entity_embed.data_utils.utils.split_clusters(cluster_dict, train_proportion, valid_proportion, random_seed)
entity_embed.data_utils.utils.split_record_dict_on_clusters(record_dict, cluster_field, train_proportion, valid_proportion, random_seed)
entity_embed.data_utils.utils.subdict(d, keys)
entity_embed.data_utils.utils.tensor_dict_to_device(tensor_dict, device)

Module contents