from typing import Iterator, Tuple
import h5py
import hdf5plugin
import numpy as np
import pandas as pd
[docs]
def one_hot_encode(sequences):
    """
    One-hot encode a batch of amino-acid sequences.

    Parameters
    ----------
    sequences : sequence of str
        Sequences written with the 20 standard one-letter amino-acid codes.
        The character "0" is also accepted and maps to class 0 (presumably a
        padding/gap symbol -- confirm with callers). Sequences may differ in
        length.

    Returns
    -------
    np.ndarray
        A float32 array of shape (num_sequences, max_sequence_length, 21).
        Positions past the end of a shorter sequence are left all-zero.
        An empty batch yields an array of shape (0, 0, 21).

    Raises
    ------
    KeyError
        If a sequence contains a character outside the supported alphabet.
    """
    # The class index of each symbol is its position in this alphabet:
    # "0" -> 0, "A" -> 1, ..., "Y" -> 20.
    alphabet = "0ACDEFGHIKLMNPQRSTVWY"
    aa_to_int = {aa: index for index, aa in enumerate(alphabet)}

    num_sequences = len(sequences)
    num_classes = len(aa_to_int)
    # default=0 keeps an empty batch from raising inside max().
    sequence_length = max((len(sequence) for sequence in sequences), default=0)

    one_hot_encoded_seq = np.zeros(
        (num_sequences, sequence_length, num_classes), dtype=np.float32
    )

    for i, sequence in enumerate(sequences):
        # Each sequence is converted on its own instead of stacking all of
        # them into one array first: stacking ragged lists with np.array()
        # raises on recent NumPy versions.
        class_indices = np.array([aa_to_int[aa] for aa in sequence], dtype=np.intp)
        one_hot_encoded_seq[i, np.arange(len(class_indices)), class_indices] = 1
    return one_hot_encoded_seq
[docs]
def MaxPad(original_array: np.array, shape: tuple) -> np.array:
    """
    Zero-pad a 2-D array at the bottom and right up to a target shape.

    Parameters
    ----------
    original_array : np.array
        The 2-D array (e.g. a distance map) to pad.
    shape : tuple
        Target (height, width). Only the first two entries are read.

    Returns
    -------
    np.array
        The input with zeros appended after its last row/column so that
        each axis reaches the requested size. An axis that is already at
        least as large as requested is left unchanged (never cropped).
    """
    # Per-axis (before, after) padding; a negative deficit is clamped to 0.
    trailing_padding = tuple(
        (0, max(0, shape[axis] - original_array.shape[axis])) for axis in (0, 1)
    )
    return np.pad(
        original_array,
        trailing_padding,
        mode="constant",
        constant_values=0,
    )
[docs]
def load_hdf5_compressed(file_path, frame=None, keys_to_load=None):
    """
    Read datasets from an HDF5 file into a dictionary.

    Parameters:
    - file_path (str): Path to the HDF5 file.
    - frame (int, optional): Index along the first axis to read from the
      'dm' and 'latents' datasets. If None, or if the dataset's first axis
      is not longer than `frame`, the whole dataset is read instead.
    - keys_to_load (list, optional): Names of the datasets to read. If None,
      every top-level key in the file is read.

    Returns:
    - dict: Maps each requested key to the data read for it (a single frame
      for 'dm'/'latents' when `frame` applies, otherwise the full dataset).
    """
    # Only these datasets are sliced per frame; all others are read whole.
    frame_indexed_keys = {"dm", "latents"}
    loaded = {}
    with h5py.File(file_path, "r") as h5_file:
        if keys_to_load is None:
            selected_keys = list(h5_file.keys())
        else:
            selected_keys = keys_to_load

        for name in selected_keys:
            dset = h5_file[name]
            use_single_frame = (
                frame is not None
                and name in frame_indexed_keys
                and dset.shape[0] > frame
            )
            # dset[()] reads the entire dataset into memory.
            loaded[name] = dset[frame] if use_single_frame else dset[()]
    return loaded
[docs]
def read_tsv_file(tsv_file: str) -> pd.DataFrame:
    """
    Read the first two columns of a header-less, tab-separated file.

    Parameters
    ----------
    tsv_file : str
        A path to a tsv file containing the paths to distance maps as a first
        column and the index of a distance map to load as a second column.
        Any further columns are ignored.

    Returns
    -------
    pd.DataFrame
        A two-column frame with integer column labels 0 (path) and 1 (index),
        one row per line of the file. Note that iterating the frame directly
        yields the column labels, not the rows; use e.g. ``itertuples`` or
        ``df[0]`` / ``df[1]`` to walk the entries.
    """
    # header=None: the first line is data, so columns get labels 0 and 1.
    return pd.read_csv(tsv_file, sep="\t", header=None, usecols=[0, 1])
[docs]
def symmetrize(matrix):
    """
    Return a symmetric version of a square matrix.

    If the matrix already equals its transpose it is returned as-is (the
    same object, not a copy). Otherwise a new matrix is built from the
    strict upper triangle mirrored across the diagonal plus the original
    diagonal; values in the lower triangle of the input are discarded.
    """
    # Guard clause: nothing to do for an already-symmetric input.
    if np.array_equal(matrix, matrix.T):
        return matrix

    # Entries strictly above the diagonal; everything else is zero.
    strict_upper = np.triu(matrix, k=1)
    # The mirrored triangle carries no diagonal, so add it back separately.
    diagonal_only = np.diag(np.diag(matrix))
    return strict_upper + strict_upper.T + diagonal_only