entity_embed.data_utils package¶
Submodules¶
entity_embed.data_utils.attr_config_parser module¶
entity_embed.data_utils.datasets module¶
-
class
entity_embed.data_utils.datasets.ClusterDataset(record_dict, cluster_field, record_numericalizer, batch_size, max_cluster_size_in_batch, random_seed)¶ Bases:
Generic[torch.utils.data.dataset.T_co]
-
class
entity_embed.data_utils.datasets.RecordDataset(record_dict, record_numericalizer, batch_size)¶ Bases:
Generic[torch.utils.data.dataset.T_co]
entity_embed.data_utils.numericalizer module¶
-
class
entity_embed.data_utils.numericalizer.FieldConfig(key: str, field_type: entity_embed.data_utils.numericalizer.FieldType, tokenizer: Callable[[str], List[str]], alphabet: List[str], max_str_len: int, vocab: torchtext.vocab.Vocab, n_channels: int, embed_dropout_p: float, use_attention: bool)¶ Bases:
object
-
alphabet: List[str]¶
-
embed_dropout_p: float¶
-
field_type: entity_embed.data_utils.numericalizer.FieldType¶
-
property
is_multitoken¶
-
key: str¶
-
max_str_len: int¶
-
n_channels: int¶
-
tokenizer: Callable[[str], List[str]]¶
-
use_attention: bool¶
-
vocab: torchtext.vocab.Vocab¶
-
-
class
entity_embed.data_utils.numericalizer.FieldType(value)¶ Bases:
enum.Enum
An enumeration.
-
MULTITOKEN= 'multitoken'¶
-
SEMANTIC_MULTITOKEN= 'semantic_multitoken'¶
-
SEMANTIC_STRING= 'semantic_string'¶
-
STRING= 'string'¶
-
-
class
entity_embed.data_utils.numericalizer.MultitokenNumericalizer(field, field_config)¶ Bases:
object
-
build_tensor(val)¶
-
is_multitoken= True¶
-
-
class
entity_embed.data_utils.numericalizer.RecordNumericalizer(field_config_dict, field_to_numericalizer)¶ Bases:
object
-
build_tensor_dict(record)¶
-
-
class
entity_embed.data_utils.numericalizer.SemanticMultitokenNumericalizer(field, field_config)¶ Bases:
entity_embed.data_utils.numericalizer.MultitokenNumericalizer
-
class
entity_embed.data_utils.numericalizer.SemanticStringNumericalizer(field, field_config)¶ Bases:
object
-
build_tensor(val)¶
-
is_multitoken= False¶
-
-
class
entity_embed.data_utils.numericalizer.StringNumericalizer(field, field_config)¶ Bases:
object
-
build_tensor(val)¶
-
is_multitoken= False¶
-
-
entity_embed.data_utils.numericalizer.default_tokenizer(val)¶
entity_embed.data_utils.union_find module¶
entity_embed.data_utils.utils module¶
-
entity_embed.data_utils.utils.Enumerator(start=0, initial=())¶
-
entity_embed.data_utils.utils.assign_clusters(record_dict, cluster_field, cluster_mapping)¶
-
entity_embed.data_utils.utils.cluster_dict_to_id_pairs(cluster_dict, left_id_set=None, right_id_set=None)¶
-
entity_embed.data_utils.utils.compute_max_str_len(field_val_gen, is_multitoken, tokenizer)¶
-
entity_embed.data_utils.utils.compute_vocab_counter(field_val_gen, tokenizer)¶
-
entity_embed.data_utils.utils.count_cluster_dict_pairs(cluster_dict)¶
-
entity_embed.data_utils.utils.id_pairs_to_cluster_mapping_and_dict(id_pairs, record_dict)¶
-
entity_embed.data_utils.utils.record_dict_to_cluster_dict(record_dict, cluster_field)¶
-
entity_embed.data_utils.utils.record_dict_to_left_right_id_set(record_dict, source_field, left_source)¶
-
entity_embed.data_utils.utils.split_clusters(cluster_dict, train_proportion, valid_proportion, random_seed)¶
-
entity_embed.data_utils.utils.split_record_dict_on_clusters(record_dict, cluster_field, train_proportion, valid_proportion, random_seed)¶
-
entity_embed.data_utils.utils.subdict(d, keys)¶
-
entity_embed.data_utils.utils.tensor_dict_to_device(tensor_dict, device)¶