"""Text utility script."""
import os
import random
from typing import Dict
from typing import List
from typing import Sequence
import numpy as np
import torch
from sklearn.utils.murmurhash import murmurhash3_32
# Maps a batch field name to the name of the dtype ("float" or "long")
# associated with that field.
_dtypes_mapping = {
    "label": "float",
    "cat": "long",
    "cont": "float",
    "weight": "float",
    # Tokenizer-style fields.
    "input_ids": "long",
    "attention_mask": "long",
    "token_type_ids": "long",
    # "text" holds embeddings, hence a floating-point dtype.
    "text": "float",
    "length": "long",
}
def inv_sigmoid(x: np.ndarray) -> np.ndarray:
    """Apply the inverse of the sigmoid function (the logit) element-wise.

    Computes ``log(x / (1 - x))``.

    Args:
        x: Input array.

    Returns:
        Transformed array.

    """
    odds = x / (1 - x)
    return np.log(odds)
def inv_softmax(x: np.ndarray) -> np.ndarray:
    """Variant of inverse softmax transformation with zero constant term.

    Takes absolute values of the input, normalizes them by the sum over
    all elements (a small epsilon is added to both numerator and
    denominator) and returns the natural logarithm of the result.

    Args:
        x: Input array.

    Returns:
        Transformed array.

    """
    eps = 1e-7
    magnitudes = np.abs(x)
    # The sum runs over every element of the array, not along a single axis.
    normalized = (magnitudes + eps) / (np.sum(magnitudes) + eps)
    return np.log(normalized)
def is_shuffle(stage: str) -> bool:
    """Tell whether the input should be shuffled for the given stage.

    Args:
        stage: One of ``"train"``, ``"val"``, ``"test"``.

    Returns:
        ``True`` for ``"train"``, ``False`` for ``"val"`` and ``"test"``.

    Raises:
        KeyError: If ``stage`` is not one of the known stages.

    """
    return {"train": True, "val": False, "test": False}[stage]
def seed_everything(seed: int = 42, deterministic: bool = True):
    """Set random seed and cudnn params.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and exports ``PYTHONHASHSEED`` to the environment.

    Args:
        seed: Random state.
        deterministic: If ``True``, switch the cuDNN backend to
            deterministic algorithms and disable its auto-tuner.

    """
    random.seed(seed)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it
    # here only influences processes spawned after this call.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU, not only the current one; the call is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        # The cuDNN auto-tuner may pick different algorithms from run to
        # run, which defeats the purpose of the deterministic mode.
        torch.backends.cudnn.benchmark = False
def parse_devices(dvs, is_dp: bool = False) -> tuple:
    """Parse devices and convert first to the torch device.

    Accepted device specifications are ``"cpu"``, ``"cuda"``, ``"cuda:N"``,
    a numeric string ``"N"``, an integer ``N`` or a ``torch.device``.
    As soon as a CPU device is encountered, or when CUDA is unavailable,
    the CPU device is returned.

    Args:
        dvs: List, string with device ids or torch.device.
        is_dp: Use data parallel - additionally returns device ids.

    Returns:
        First torch device and list of gpu ids. The list of ids is only
        returned when ``is_dp`` is set and more than one device was given,
        otherwise the second element is ``None``.

    Raises:
        ValueError: If a device specification has an unsupported type or
            a string cannot be parsed as a device index.

    """
    cpu = torch.device("cpu")
    if (not torch.cuda.is_available()) or (dvs is None):
        return cpu, None

    if not isinstance(dvs, (list, tuple)):
        dvs = [dvs]

    devices = []
    ids = []
    for spec in dvs:
        if isinstance(spec, str):
            if spec == "cpu":
                return cpu, None
            if spec == "cuda":
                idx = 0
                device = torch.device(spec)
            elif spec.startswith("cuda:"):
                idx = int(spec.split("cuda:")[-1])
                # Always hand back a torch.device, never the raw string.
                device = torch.device(spec)
            else:
                idx = int(spec)
                device = torch.device("cuda:{}".format(idx))
        elif isinstance(spec, int):
            idx = spec
            device = torch.device("cuda:{}".format(spec))
        elif isinstance(spec, torch.device):
            if spec.type == "cpu":
                return spec, None
            # A device without an explicit index is treated as id 0.
            idx = 0 if spec.index is None else spec.index
            device = spec
        else:
            raise ValueError("Unknown device type: {}".format(spec))

        ids.append(idx)
        devices.append(device)

    if not devices:
        # An empty device list is handled like "no device given".
        return cpu, None

    return devices[0], ids if (len(devices) > 1) and is_dp else None
def custom_collate(batch: List[np.ndarray]) -> torch.Tensor:
    """Puts each data field into a tensor with outer dimension batch size.

    Args:
        batch: Samples of a single field; either ``torch.Tensor`` objects
            or anything ``np.array`` can turn into a numeric array.

    Returns:
        For tensor inputs, the samples stacked along a new first dimension.
        Otherwise, a ``float`` tensor built from ``np.array(batch)``.

    """
    # Local import keeps the block self-contained.
    from torch.utils.data import get_worker_info

    elem = batch[0]
    if not isinstance(elem, torch.Tensor):
        return torch.from_numpy(np.array(batch)).float()

    out = None
    if get_worker_info() is not None:
        # Inside a DataLoader worker: stack directly into shared memory to
        # avoid an extra copy when the batch is sent to the main process.
        # In the main process no shared memory is allocated.
        numel = sum(x.numel() for x in batch)
        storage = elem.storage()._new_shared(numel)
        # Give the output its final shape up front so torch.stack does not
        # have to (deprecated) implicitly resize it.
        out = elem.new(storage).resize_(len(batch), *elem.size())
    return torch.stack(batch, 0, out=out)
def collate_dict(batch: List[Dict[str, np.ndarray]]) -> Dict[str, torch.Tensor]:
    """custom_collate for dicts.

    The keys of the first sample define the fields of the result; every
    field is collated with ``custom_collate`` over all samples.

    Args:
        batch: List of samples, each a mapping from field name to value.

    Returns:
        Mapping from field name to the collated tensor of that field.

    Raises:
        KeyError: If a sample lacks one of the first sample's keys.

    """
    keys = list(batch[0].keys())
    # Look values up by key rather than by position, so samples whose dicts
    # were built in a different key order are still collated correctly.
    return {key: custom_collate([sample[key] for sample in batch]) for key in keys}
def single_text_hash(x: str) -> str:
    """Get text hash.

    The text is hashed with ``murmurhash3_32`` using seed 13. A strictly
    positive hash is rendered as its decimal digits; any other value is
    rendered as ``"m"`` followed by its absolute value (so zero becomes
    ``"m0"``).

    Args:
        x: Text.

    Returns:
        String text hash.

    """
    hashed = murmurhash3_32(x, seed=13)
    if hashed > 0:
        return str(hashed)
    return "m" + str(abs(hashed))
def get_textarr_hash(x: Sequence[str]) -> str:
    """Get hash of array with texts.

    The result starts with the hash of ``str(x)`` and is followed by the
    hashes of the first three non-empty texts of ``x``, all joined with
    underscores.

    Args:
        x: Text array.

    Returns:
        Hash of array.

    """
    max_samples = 3
    parts = [single_text_hash(str(x))]
    for text in x:
        if text != "":
            parts.append(single_text_hash(text))
            # parts[0] is the hash of the whole array, hence the offset.
            if len(parts) - 1 >= max_samples:
                break
    return "_".join(parts)