Alignment API

This section documents the components responsible for structural alignment in FlatProt, primarily focused on finding the best match in a reference database and retrieving the associated transformation matrix.

Alignment Concept

The alignment process in FlatProt serves to orient an input protein structure according to a standardized reference frame, typically based on its structural superfamily.

  1. Structural Search: It uses an external tool, Foldseek, to search a pre-compiled database of reference structures (e.g., CATH domains) for the best structural match to the input protein.
  2. Result Filtering: The Foldseek results are filtered based on metrics like alignment probability (--min-probability) or by specifying a direct target ID (--target-db-id).
  3. Matrix Retrieval: Once a suitable match is identified (represented by a target_id from Foldseek), FlatProt queries its internal HDF5 database (AlignmentDatabase) using this target_id. This database stores pre-calculated 4x4 transformation matrices that map the reference structure (the target) to a standardized orientation for its superfamily.
  4. Output: The primary output is the retrieved transformation matrix (TransformationMatrix), which can then be used by the project command to render the input structure in the standardized orientation. Alignment metadata (scores, matched IDs) can also be saved.

Top-Level Alignment Functions

These functions provide the main entry points for performing alignment using a database.

AlignmentDBEntry dataclass

Stores alignment data with its rotation matrix.

Source code in src/flatprot/alignment/db.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
@dataclass
class AlignmentDBEntry:
    """Stores alignment data with its rotation matrix."""

    rotation_matrix: TransformationMatrix
    entry_id: str
    structure_name: str
    metadata: Optional[Dict[str, float | str]] = None

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, AlignmentDBEntry):
            return False
        return (
            np.allclose(
                self.rotation_matrix.to_array(), other.rotation_matrix.to_array()
            )
            and self.entry_id == other.entry_id
            and self.structure_name == other.structure_name
        )

AlignmentDatabase

Handles alignment database using HDF5 storage with memory-mapped arrays.

Source code in src/flatprot/alignment/db.py
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
class AlignmentDatabase:
    """Handles alignment database using HDF5 storage with memory-mapped arrays."""

    def __init__(self, path: Path):
        self.path = path
        self._file: Optional[h5py.File] = None
        self._structure_name_index: dict[str, str] = {}  # structure_name -> entry_id

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def open(self) -> None:
        """Opens the database file in read/write mode."""
        self._file = h5py.File(self.path, "a")
        self._load_index()

    def close(self) -> None:
        """Closes the database file."""
        if self._file is not None:
            self._file.close()
            self._file = None

    def _load_index(self) -> None:
        """Loads structure name index from HDF5 file."""
        if "index" not in self._file:
            return

        names = self._file["index/structure_names"][:]
        ids = self._file["index/entry_ids"][:]
        self._structure_name_index = dict(zip(names, ids))

    def _save_index(self) -> None:
        """Saves structure name index to HDF5 file."""
        if "index" in self._file:
            del self._file["index"]

        index = self._file.create_group("index")
        names = list(self._structure_name_index.keys())
        ids = list(self._structure_name_index.values())
        index.create_dataset("structure_names", data=names)
        index.create_dataset("entry_ids", data=ids)

    def contains_entry_id(self, entry_id: str) -> bool:
        """Checks if an entry_id exists in the database. O(1) lookup."""
        if self._file is None:
            raise RuntimeError("Database not opened")
        return entry_id in self._file

    def contains_structure_name(self, structure_name: str) -> bool:
        """Checks if a structure_name exists in the database. O(1) lookup."""
        return structure_name in self._structure_name_index

    def get_by_entry_id(
        self, entry_id: str, default: Optional[AlignmentDBEntry] = None
    ) -> Optional[AlignmentDBEntry]:
        """Returns alignment entry for given entry_id or default if not found. O(1) lookup."""
        if self._file is None:
            raise RuntimeError("Database not opened")
        if entry_id not in self._file:
            return default

        entry_group = self._file[entry_id]
        metadata = {k: v for k, v in entry_group.attrs.items() if k != "structure_name"}
        return AlignmentDBEntry(
            rotation_matrix=TransformationMatrix.from_array(entry_group["rotation"][:]),
            entry_id=entry_id,
            structure_name=entry_group.attrs["structure_name"],
            metadata=metadata if metadata else None,
        )

    def get_by_structure_name(
        self, structure_name: str, default: Optional[AlignmentDBEntry] = None
    ) -> Optional[AlignmentDBEntry]:
        """Returns alignment entry for given structure_name or default if not found. O(1) lookup."""
        if structure_name not in self._structure_name_index:
            return default
        entry_id = self._structure_name_index[structure_name]
        return self.get_by_entry_id(entry_id)

    def add_entry(self, entry: AlignmentDBEntry) -> None:
        """Adds a new entry to the database."""
        if self._file is None:
            raise RuntimeError("Database not opened")

        if entry.entry_id in self._file:
            raise ValueError(f"Entry ID {entry.entry_id} already exists")

        if entry.structure_name in self._structure_name_index:
            raise ValueError(f"Structure name {entry.structure_name} already exists")

        # Create entry group and save data
        entry_group = self._file.create_group(entry.entry_id)
        entry_group.create_dataset("rotation", data=entry.rotation_matrix.to_array())
        entry_group.attrs["structure_name"] = entry.structure_name

        # Store metadata if available
        if entry.metadata:
            for key, value in entry.metadata.items():
                entry_group.attrs[key] = value

        # Update index
        self._structure_name_index[entry.structure_name] = entry.entry_id
        self._save_index()

    def update(self, entry: AlignmentDBEntry) -> None:
        """Updates an existing entry in the database."""
        if self._file is None:
            raise RuntimeError("Database not opened")

        if entry.entry_id not in self._file:
            raise KeyError(f"Entry ID {entry.entry_id} not found in database")

        old_entry = self.get_by_entry_id(entry.entry_id)

        # Check structure name conflicts
        if (
            entry.structure_name != old_entry.structure_name
            and entry.structure_name in self._structure_name_index
        ):
            raise ValueError(f"Structure name {entry.structure_name} already exists")

        # Update index if structure name changed
        if entry.structure_name != old_entry.structure_name:
            del self._structure_name_index[old_entry.structure_name]
            self._structure_name_index[entry.structure_name] = entry.entry_id

        # Update entry data
        del self._file[entry.entry_id]
        entry_group = self._file.create_group(entry.entry_id)
        entry_group.create_dataset("rotation", data=entry.rotation_matrix.to_array())
        entry_group.attrs["structure_name"] = entry.structure_name

        # Store metadata if available
        if entry.metadata:
            for key, value in entry.metadata.items():
                entry_group.attrs[key] = value

        # Save index
        self._save_index()

add_entry(entry)

Adds a new entry to the database.

Source code in src/flatprot/alignment/db.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def add_entry(self, entry: AlignmentDBEntry) -> None:
    """Adds a new entry to the database."""
    if self._file is None:
        raise RuntimeError("Database not opened")

    if entry.entry_id in self._file:
        raise ValueError(f"Entry ID {entry.entry_id} already exists")

    if entry.structure_name in self._structure_name_index:
        raise ValueError(f"Structure name {entry.structure_name} already exists")

    # Create entry group and save data
    entry_group = self._file.create_group(entry.entry_id)
    entry_group.create_dataset("rotation", data=entry.rotation_matrix.to_array())
    entry_group.attrs["structure_name"] = entry.structure_name

    # Store metadata if available
    if entry.metadata:
        for key, value in entry.metadata.items():
            entry_group.attrs[key] = value

    # Update index
    self._structure_name_index[entry.structure_name] = entry.entry_id
    self._save_index()

close()

Closes the database file.

Source code in src/flatprot/alignment/db.py
55
56
57
58
59
def close(self) -> None:
    """Closes the database file."""
    if self._file is not None:
        self._file.close()
        self._file = None

contains_entry_id(entry_id)

Checks if an entry_id exists in the database. O(1) lookup.

Source code in src/flatprot/alignment/db.py
81
82
83
84
85
def contains_entry_id(self, entry_id: str) -> bool:
    """Checks if an entry_id exists in the database. O(1) lookup."""
    if self._file is None:
        raise RuntimeError("Database not opened")
    return entry_id in self._file

contains_structure_name(structure_name)

Checks if a structure_name exists in the database. O(1) lookup.

Source code in src/flatprot/alignment/db.py
87
88
89
def contains_structure_name(self, structure_name: str) -> bool:
    """Checks if a structure_name exists in the database. O(1) lookup."""
    return structure_name in self._structure_name_index

get_by_entry_id(entry_id, default=None)

Returns alignment entry for given entry_id or default if not found. O(1) lookup.

Source code in src/flatprot/alignment/db.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def get_by_entry_id(
    self, entry_id: str, default: Optional[AlignmentDBEntry] = None
) -> Optional[AlignmentDBEntry]:
    """Returns alignment entry for given entry_id or default if not found. O(1) lookup."""
    if self._file is None:
        raise RuntimeError("Database not opened")
    if entry_id not in self._file:
        return default

    entry_group = self._file[entry_id]
    metadata = {k: v for k, v in entry_group.attrs.items() if k != "structure_name"}
    return AlignmentDBEntry(
        rotation_matrix=TransformationMatrix.from_array(entry_group["rotation"][:]),
        entry_id=entry_id,
        structure_name=entry_group.attrs["structure_name"],
        metadata=metadata if metadata else None,
    )

get_by_structure_name(structure_name, default=None)

Returns alignment entry for given structure_name or default if not found. O(1) lookup.

Source code in src/flatprot/alignment/db.py
109
110
111
112
113
114
115
116
def get_by_structure_name(
    self, structure_name: str, default: Optional[AlignmentDBEntry] = None
) -> Optional[AlignmentDBEntry]:
    """Returns alignment entry for given structure_name or default if not found. O(1) lookup."""
    if structure_name not in self._structure_name_index:
        return default
    entry_id = self._structure_name_index[structure_name]
    return self.get_by_entry_id(entry_id)

open()

Opens the database file in read/write mode.

Source code in src/flatprot/alignment/db.py
50
51
52
53
def open(self) -> None:
    """Opens the database file in read/write mode."""
    self._file = h5py.File(self.path, "a")
    self._load_index()

update(entry)

Updates an existing entry in the database.

Source code in src/flatprot/alignment/db.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def update(self, entry: AlignmentDBEntry) -> None:
    """Updates an existing entry in the database."""
    if self._file is None:
        raise RuntimeError("Database not opened")

    if entry.entry_id not in self._file:
        raise KeyError(f"Entry ID {entry.entry_id} not found in database")

    old_entry = self.get_by_entry_id(entry.entry_id)

    # Check structure name conflicts
    if (
        entry.structure_name != old_entry.structure_name
        and entry.structure_name in self._structure_name_index
    ):
        raise ValueError(f"Structure name {entry.structure_name} already exists")

    # Update index if structure name changed
    if entry.structure_name != old_entry.structure_name:
        del self._structure_name_index[old_entry.structure_name]
        self._structure_name_index[entry.structure_name] = entry.entry_id

    # Update entry data
    del self._file[entry.entry_id]
    entry_group = self._file.create_group(entry.entry_id)
    entry_group.create_dataset("rotation", data=entry.rotation_matrix.to_array())
    entry_group.attrs["structure_name"] = entry.structure_name

    # Store metadata if available
    if entry.metadata:
        for key, value in entry.metadata.items():
            entry_group.attrs[key] = value

    # Save index
    self._save_index()

AlignmentError

Bases: FlatProtError

Base class for alignment-related errors.

Source code in src/flatprot/alignment/errors.py
 8
 9
10
11
class AlignmentError(FlatProtError):
    """Base class for alignment-related errors."""

    pass

AlignmentResult

Bases: NamedTuple

Results from a structural family alignment.

Source code in src/flatprot/alignment/foldseek.py
15
16
17
18
19
20
21
22
class AlignmentResult(NamedTuple):
    """Results from a structural family alignment."""

    db_id: str
    probability: float
    aligned_region: np.ndarray
    alignment_scores: np.ndarray
    rotation_matrix: TransformationMatrix

DatabaseEntryNotFoundError

Bases: AlignmentError

Raised when a database entry is not found.

Source code in src/flatprot/alignment/errors.py
20
21
22
23
class DatabaseEntryNotFoundError(AlignmentError):
    """Raised when a database entry is not found."""

    pass

FoldseekAligner

Handles structural family alignments using FoldSeek.

Source code in src/flatprot/alignment/foldseek.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
class FoldseekAligner:
    """Handles structural family alignments using FoldSeek."""

    def __init__(
        self,
        foldseek_executable: str,
        database_path: Path,
    ):
        self.foldseek_executable = foldseek_executable
        self.database_path = database_path

    def align_structure(
        self,
        structure_path: Path,
        min_probability: float = 0.5,
        fixed_alignment_id: Optional[str] = None,
        tmp_dir: Optional[Path] = None,
    ) -> Optional[AlignmentResult]:
        """Aligns structure to family database and returns best match."""

        if tmp_dir is None:
            tmp_dir = tempfile.TemporaryDirectory(
                ignore_cleanup_errors=True, prefix="foldseek_"
            )
            tmp_dir_path = Path(tmp_dir.name)
        else:
            tmp_dir_path = tmp_dir

        # Run FoldSeek search
        result_file = tmp_dir_path / "foldseek_result.tsv"
        foldseek_tmp_dir = tmp_dir_path / "foldseek_tmp"

        self._run_foldseek_search(structure_path, result_file, foldseek_tmp_dir)

        if not result_file.exists():
            raise RuntimeError("FoldSeek result file not found")

        # Parse results
        results = pl.read_csv(
            result_file,
            separator="\t",
            has_header=False,
            new_columns=[
                "query",
                "target",
                "qstart",
                "qend",
                "tstart",
                "tend",
                "tseq",
                "prob",
                "alntmscore",
                "u",
                "t",
                "lddtfull",
            ],
            schema_overrides={
                "query": pl.Utf8,
                "target": pl.Utf8,
            },
        )
        if len(results) == 0:
            return None

        # Get best match (or fixed family if specified)
        if fixed_alignment_id:
            match = results.filter(pl.col("target") == fixed_alignment_id)
        else:
            match = results.sort("prob", descending=True)
            if match[0, "prob"] < min_probability:
                return None

        if isinstance(tmp_dir, tempfile.TemporaryDirectory):
            tmp_dir.cleanup()

        target_to_query_matrix = _parse_foldseek_vector(match[0, "u"]).reshape(3, 3)
        target_to_query_translation = _parse_foldseek_vector(match[0, "t"])

        # Manually calculate the inverse of the alignment transformation
        # Inverse rotation is the transpose of the rotation matrix
        R_align_inv = target_to_query_matrix.T
        # Inverse translation is -R_inv @ t
        t_align_inv = -R_align_inv @ target_to_query_translation
        query_to_target_transform = TransformationMatrix(
            rotation=R_align_inv, translation=t_align_inv
        )

        return AlignmentResult(
            db_id=match[0, "target"],
            probability=float(match[0, "prob"]),
            aligned_region=np.array((int(match[0, "qstart"]), int(match[0, "qend"]))),
            alignment_scores=_parse_foldseek_vector(match[0, "lddtfull"]),
            rotation_matrix=query_to_target_transform,
        )

    def align_structures_batch(
        self,
        structure_paths: List[Path],
        min_probability: float = 0.5,
        fixed_alignment_id: Optional[str] = None,
        tmp_dir: Optional[Path] = None,
    ) -> Dict[Path, Optional[AlignmentResult]]:
        """Batch align multiple structures to family database for better performance.

        Args:
            structure_paths: List of structure file paths to align
            min_probability: Minimum alignment probability threshold
            fixed_alignment_id: Optional fixed family ID for all alignments
            tmp_dir: Optional temporary directory path

        Returns:
            Dictionary mapping structure paths to their alignment results
        """
        if tmp_dir is None:
            tmp_dir = tempfile.TemporaryDirectory(
                ignore_cleanup_errors=True, prefix="foldseek_batch_"
            )
            tmp_dir_path = Path(tmp_dir.name)
        else:
            tmp_dir_path = tmp_dir

        # Create input directory with all structures
        structures_dir = tmp_dir_path / "batch_structures"
        structures_dir.mkdir(parents=True, exist_ok=True)

        # Copy all structures to batch directory
        structure_mapping = {}
        for i, structure_path in enumerate(structure_paths):
            # Use index to avoid naming conflicts
            batch_filename = (
                f"structure_{i}_{structure_path.stem}{structure_path.suffix}"
            )
            batch_file = structures_dir / batch_filename
            batch_file.write_bytes(structure_path.read_bytes())
            structure_mapping[batch_filename] = structure_path

        # Run batch foldseek search
        result_file = tmp_dir_path / "batch_foldseek_result.tsv"
        foldseek_tmp_dir = tmp_dir_path / "foldseek_tmp"

        self._run_foldseek_batch_search(structures_dir, result_file, foldseek_tmp_dir)

        if not result_file.exists():
            raise RuntimeError("FoldSeek batch result file not found")

        # Parse batch results
        try:
            results = pl.read_csv(
                result_file,
                separator="\t",
                has_header=False,
                new_columns=[
                    "query",
                    "target",
                    "qstart",
                    "qend",
                    "tstart",
                    "tend",
                    "tseq",
                    "prob",
                    "alntmscore",
                    "u",
                    "t",
                    "lddtfull",
                ],
                schema_overrides={
                    "query": pl.Utf8,
                    "target": pl.Utf8,
                },
            )
        except Exception:
            # Return empty results if parsing fails
            return {path: None for path in structure_paths}

        # Group results by query and process each structure
        alignment_results = {}
        for structure_path in structure_paths:
            # Find the batch filename for this structure
            batch_filename = None
            for batch_name, orig_path in structure_mapping.items():
                if orig_path == structure_path:
                    batch_filename = batch_name
                    break

            if batch_filename is None:
                alignment_results[structure_path] = None
                continue

            # Filter results for this specific query
            query_results = results.filter(
                pl.col("query").str.contains(Path(batch_filename).stem)
            )

            if len(query_results) == 0:
                alignment_results[structure_path] = None
                continue

            # Get best match (or fixed family if specified)
            if fixed_alignment_id:
                match = query_results.filter(pl.col("target") == fixed_alignment_id)
                if len(match) == 0:
                    alignment_results[structure_path] = None
                    continue
            else:
                match = query_results.sort("prob", descending=True)
                if match[0, "prob"] < min_probability:
                    alignment_results[structure_path] = None
                    continue

            # Create alignment result
            target_to_query_matrix = _parse_foldseek_vector(match[0, "u"]).reshape(3, 3)
            target_to_query_translation = _parse_foldseek_vector(match[0, "t"])

            # Calculate inverse transformation
            R_align_inv = target_to_query_matrix.T
            t_align_inv = -R_align_inv @ target_to_query_translation
            query_to_target_transform = TransformationMatrix(
                rotation=R_align_inv, translation=t_align_inv
            )

            alignment_results[structure_path] = AlignmentResult(
                db_id=match[0, "target"],
                probability=float(match[0, "prob"]),
                aligned_region=np.array(
                    (int(match[0, "qstart"]), int(match[0, "qend"]))
                ),
                alignment_scores=_parse_foldseek_vector(match[0, "lddtfull"]),
                rotation_matrix=query_to_target_transform,
            )

        if isinstance(tmp_dir, tempfile.TemporaryDirectory):
            tmp_dir.cleanup()

        return alignment_results

    def _run_foldseek_batch_search(
        self, structures_dir: Path, output_file: Path, tmp_dir: Path
    ) -> None:
        """Runs FoldSeek batch search against family database."""
        cmd = [
            self.foldseek_executable,
            "easy-search",
            str(structures_dir),
            str(self.database_path),
            str(output_file),
            str(tmp_dir),
            "--format-output",
            "query,target,qstart,qend,tstart,tend,tseq,prob,alntmscore,u,t,lddtfull",
        ]

        subprocess.run(cmd, check=True, capture_output=True, text=True)

    def _run_foldseek_search(
        self, structure_path: Path, output_file: Path, tmp_dir: Path
    ) -> None:
        """Runs FoldSeek search against family database."""
        cmd = [
            self.foldseek_executable,
            "easy-search",
            str(structure_path),
            str(self.database_path),
            str(output_file),
            str(tmp_dir),
            "--format-output",
            "query,target,qstart,qend,tstart,tend,tseq,prob,alntmscore,u,t,lddtfull",
        ]

        subprocess.run(cmd, check=True, capture_output=True, text=True)

align_structure(structure_path, min_probability=0.5, fixed_alignment_id=None, tmp_dir=None)

Aligns structure to family database and returns best match.

Source code in src/flatprot/alignment/foldseek.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def align_structure(
    self,
    structure_path: Path,
    min_probability: float = 0.5,
    fixed_alignment_id: Optional[str] = None,
    tmp_dir: Optional[Path] = None,
) -> Optional[AlignmentResult]:
    """Aligns structure to family database and returns best match."""

    if tmp_dir is None:
        tmp_dir = tempfile.TemporaryDirectory(
            ignore_cleanup_errors=True, prefix="foldseek_"
        )
        tmp_dir_path = Path(tmp_dir.name)
    else:
        tmp_dir_path = tmp_dir

    # Run FoldSeek search
    result_file = tmp_dir_path / "foldseek_result.tsv"
    foldseek_tmp_dir = tmp_dir_path / "foldseek_tmp"

    self._run_foldseek_search(structure_path, result_file, foldseek_tmp_dir)

    if not result_file.exists():
        raise RuntimeError("FoldSeek result file not found")

    # Parse results
    results = pl.read_csv(
        result_file,
        separator="\t",
        has_header=False,
        new_columns=[
            "query",
            "target",
            "qstart",
            "qend",
            "tstart",
            "tend",
            "tseq",
            "prob",
            "alntmscore",
            "u",
            "t",
            "lddtfull",
        ],
        schema_overrides={
            "query": pl.Utf8,
            "target": pl.Utf8,
        },
    )
    if len(results) == 0:
        return None

    # Get best match (or fixed family if specified)
    if fixed_alignment_id:
        match = results.filter(pl.col("target") == fixed_alignment_id)
    else:
        match = results.sort("prob", descending=True)
        if match[0, "prob"] < min_probability:
            return None

    if isinstance(tmp_dir, tempfile.TemporaryDirectory):
        tmp_dir.cleanup()

    target_to_query_matrix = _parse_foldseek_vector(match[0, "u"]).reshape(3, 3)
    target_to_query_translation = _parse_foldseek_vector(match[0, "t"])

    # Manually calculate the inverse of the alignment transformation
    # Inverse rotation is the transpose of the rotation matrix
    R_align_inv = target_to_query_matrix.T
    # Inverse translation is -R_inv @ t
    t_align_inv = -R_align_inv @ target_to_query_translation
    query_to_target_transform = TransformationMatrix(
        rotation=R_align_inv, translation=t_align_inv
    )

    return AlignmentResult(
        db_id=match[0, "target"],
        probability=float(match[0, "prob"]),
        aligned_region=np.array((int(match[0, "qstart"]), int(match[0, "qend"]))),
        alignment_scores=_parse_foldseek_vector(match[0, "lddtfull"]),
        rotation_matrix=query_to_target_transform,
    )

align_structures_batch(structure_paths, min_probability=0.5, fixed_alignment_id=None, tmp_dir=None)

Batch align multiple structures to family database for better performance.

Parameters:
  • structure_paths (List[Path]) –

    List of structure file paths to align

  • min_probability (float, default: 0.5 ) –

    Minimum alignment probability threshold

  • fixed_alignment_id (Optional[str], default: None ) –

    Optional fixed family ID for all alignments

  • tmp_dir (Optional[Path], default: None ) –

    Optional temporary directory path

Returns:
  • Dict[Path, Optional[AlignmentResult]]

    Dictionary mapping structure paths to their alignment results

Source code in src/flatprot/alignment/foldseek.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def align_structures_batch(
    self,
    structure_paths: List[Path],
    min_probability: float = 0.5,
    fixed_alignment_id: Optional[str] = None,
    tmp_dir: Optional[Path] = None,
) -> Dict[Path, Optional[AlignmentResult]]:
    """Batch align multiple structures to family database for better performance.

    Args:
        structure_paths: List of structure file paths to align
        min_probability: Minimum alignment probability threshold
        fixed_alignment_id: Optional fixed family ID for all alignments
        tmp_dir: Optional temporary directory path

    Returns:
        Dictionary mapping structure paths to their alignment results
    """
    if tmp_dir is None:
        tmp_dir = tempfile.TemporaryDirectory(
            ignore_cleanup_errors=True, prefix="foldseek_batch_"
        )
        tmp_dir_path = Path(tmp_dir.name)
    else:
        tmp_dir_path = tmp_dir

    # Create input directory with all structures
    structures_dir = tmp_dir_path / "batch_structures"
    structures_dir.mkdir(parents=True, exist_ok=True)

    # Copy all structures to batch directory
    structure_mapping = {}
    for i, structure_path in enumerate(structure_paths):
        # Use index to avoid naming conflicts
        batch_filename = (
            f"structure_{i}_{structure_path.stem}{structure_path.suffix}"
        )
        batch_file = structures_dir / batch_filename
        batch_file.write_bytes(structure_path.read_bytes())
        structure_mapping[batch_filename] = structure_path

    # Run batch foldseek search
    result_file = tmp_dir_path / "batch_foldseek_result.tsv"
    foldseek_tmp_dir = tmp_dir_path / "foldseek_tmp"

    self._run_foldseek_batch_search(structures_dir, result_file, foldseek_tmp_dir)

    if not result_file.exists():
        raise RuntimeError("FoldSeek batch result file not found")

    # Parse batch results
    try:
        results = pl.read_csv(
            result_file,
            separator="\t",
            has_header=False,
            new_columns=[
                "query",
                "target",
                "qstart",
                "qend",
                "tstart",
                "tend",
                "tseq",
                "prob",
                "alntmscore",
                "u",
                "t",
                "lddtfull",
            ],
            schema_overrides={
                "query": pl.Utf8,
                "target": pl.Utf8,
            },
        )
    except Exception:
        # Return empty results if parsing fails
        return {path: None for path in structure_paths}

    # Group results by query and process each structure
    alignment_results = {}
    for structure_path in structure_paths:
        # Find the batch filename for this structure
        batch_filename = None
        for batch_name, orig_path in structure_mapping.items():
            if orig_path == structure_path:
                batch_filename = batch_name
                break

        if batch_filename is None:
            alignment_results[structure_path] = None
            continue

        # Filter results for this specific query
        query_results = results.filter(
            pl.col("query").str.contains(Path(batch_filename).stem)
        )

        if len(query_results) == 0:
            alignment_results[structure_path] = None
            continue

        # Get best match (or fixed family if specified)
        if fixed_alignment_id:
            match = query_results.filter(pl.col("target") == fixed_alignment_id)
            if len(match) == 0:
                alignment_results[structure_path] = None
                continue
        else:
            match = query_results.sort("prob", descending=True)
            if match[0, "prob"] < min_probability:
                alignment_results[structure_path] = None
                continue

        # Create alignment result
        target_to_query_matrix = _parse_foldseek_vector(match[0, "u"]).reshape(3, 3)
        target_to_query_translation = _parse_foldseek_vector(match[0, "t"])

        # Calculate inverse transformation
        R_align_inv = target_to_query_matrix.T
        t_align_inv = -R_align_inv @ target_to_query_translation
        query_to_target_transform = TransformationMatrix(
            rotation=R_align_inv, translation=t_align_inv
        )

        alignment_results[structure_path] = AlignmentResult(
            db_id=match[0, "target"],
            probability=float(match[0, "prob"]),
            aligned_region=np.array(
                (int(match[0, "qstart"]), int(match[0, "qend"]))
            ),
            alignment_scores=_parse_foldseek_vector(match[0, "lddtfull"]),
            rotation_matrix=query_to_target_transform,
        )

    if isinstance(tmp_dir, tempfile.TemporaryDirectory):
        tmp_dir.cleanup()

    return alignment_results

NoSignificantAlignmentError

Bases: AlignmentError

Raised when no significant alignment is found.

Source code in src/flatprot/alignment/errors.py
14
15
16
17
class NoSignificantAlignmentError(AlignmentError):
    """Raised when no significant alignment is found."""

    pass

align_structure_database(structure_file, foldseek_db_path, foldseek_command='foldseek', min_probability=0.5, target_db_id=None)

Calculate the alignment result for structural alignment using FoldSeek.

Parameters:
  • structure_file (Path) –

    Path to input protein structure file (PDB/mmCIF).

  • foldseek_db_path (Path) –

    Path to FoldSeek-specific database.

  • foldseek_command (str, default: 'foldseek' ) –

    FoldSeek executable name/path.

  • min_probability (float, default: 0.5 ) –

    Minimum alignment probability threshold.

  • target_db_id (Optional[str], default: None ) –

    If provided, force alignment to this specific FoldSeek target ID.

Returns:
  • AlignmentResult( AlignmentResult ) –

    Result containing alignment details and rotation matrix.

Raises:
Source code in src/flatprot/alignment/utils.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def align_structure_database(
    structure_file: Path,
    foldseek_db_path: Path,
    foldseek_command: str = "foldseek",
    min_probability: float = 0.5,
    target_db_id: Optional[str] = None,
) -> AlignmentResult:
    """Calculate the alignment result for structural alignment using FoldSeek.

    Args:
        structure_file: Path to input protein structure file (PDB/mmCIF).
        foldseek_db_path: Path to FoldSeek-specific database.
        foldseek_command: FoldSeek executable name/path.
        min_probability: Minimum alignment probability threshold.
        target_db_id: If provided, force alignment to this specific FoldSeek target ID.

    Returns:
        AlignmentResult: Result containing alignment details and rotation matrix.

    Raises:
        AlignmentError: General alignment failures
        NoSignificantAlignmentError: No alignment meets probability threshold
    """
    aligner = FoldseekAligner(
        foldseek_executable=foldseek_command, database_path=foldseek_db_path
    )

    alignment_result = aligner.align_structure(
        structure_path=structure_file,
        min_probability=min_probability,
        fixed_alignment_id=target_db_id,
    )

    if alignment_result is None:
        raise NoSignificantAlignmentError(
            f"No alignment found above {min_probability} probability threshold"
        )

    return alignment_result

get_aligned_rotation_database(alignment, db, id_transform=_foldseek_id_to_db_id)

Combines alignment rotation with database rotation.

Parameters:
  • alignment (AlignmentResult) –

    Alignment result from align_structure_database

  • db (AlignmentDatabase) –

    Initialized AlignmentDatabase instance

  • id_transform (Callable[[str], str], default: _foldseek_id_to_db_id ) –

    Function to transform FoldSeek IDs to database IDs

Returns:
Raises:
Source code in src/flatprot/alignment/utils.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def get_aligned_rotation_database(
    alignment: AlignmentResult,
    db: AlignmentDatabase,
    id_transform: Callable[[str], str] = _foldseek_id_to_db_id,
) -> tuple[TransformationMatrix, AlignmentDBEntry]:
    """Combines alignment rotation with database rotation.

    Args:
        alignment: Alignment result from align_structure_database
        db: Initialized AlignmentDatabase instance
        id_transform: Function to transform FoldSeek IDs to database IDs

    Returns:
        tuple: A tuple containing:
            - Combined transformation matrix
            - Database entry object for the matched alignment

    Raises:
        DatabaseEntryNotFoundError: If matched database entry is missing
    """
    with db:
        db_id = id_transform(alignment.db_id)
        if not db.contains_entry_id(db_id):
            raise DatabaseEntryNotFoundError(
                f"Database entry {alignment.db_id} not found"
            )

        db_entry = db.get_by_entry_id(db_id)

    alignment_transform: TransformationMatrix = alignment.rotation_matrix
    db_transform: TransformationMatrix = db_entry.rotation_matrix

    # Combine the transformations: apply query_to_target_transform first, then db_transform.
    # T_final = T_db ∘ T_inv_align
    # Since T2.before(T1) applies T1 then T2, we use:
    final_transform = db_transform.before(alignment_transform)

    return final_transform, db_entry

options: members: - align_structure_database - get_aligned_rotation_database show_root_heading: true show_root_toc_entry: false

Foldseek Interaction

Classes and functions related to running Foldseek and parsing its results.

AlignmentResult

Bases: NamedTuple

Results from a structural family alignment.

Source code in src/flatprot/alignment/foldseek.py
15
16
17
18
19
20
21
22
class AlignmentResult(NamedTuple):
    """Results from a structural family alignment."""

    db_id: str
    probability: float
    aligned_region: np.ndarray
    alignment_scores: np.ndarray
    rotation_matrix: TransformationMatrix

FoldseekAligner

Handles structural family alignments using FoldSeek.

Source code in src/flatprot/alignment/foldseek.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
class FoldseekAligner:
    """Handles structural family alignments using FoldSeek."""

    def __init__(
        self,
        foldseek_executable: str,
        database_path: Path,
    ):
        self.foldseek_executable = foldseek_executable
        self.database_path = database_path

    def align_structure(
        self,
        structure_path: Path,
        min_probability: float = 0.5,
        fixed_alignment_id: Optional[str] = None,
        tmp_dir: Optional[Path] = None,
    ) -> Optional[AlignmentResult]:
        """Aligns structure to family database and returns best match."""

        if tmp_dir is None:
            tmp_dir = tempfile.TemporaryDirectory(
                ignore_cleanup_errors=True, prefix="foldseek_"
            )
            tmp_dir_path = Path(tmp_dir.name)
        else:
            tmp_dir_path = tmp_dir

        # Run FoldSeek search
        result_file = tmp_dir_path / "foldseek_result.tsv"
        foldseek_tmp_dir = tmp_dir_path / "foldseek_tmp"

        self._run_foldseek_search(structure_path, result_file, foldseek_tmp_dir)

        if not result_file.exists():
            raise RuntimeError("FoldSeek result file not found")

        # Parse results
        results = pl.read_csv(
            result_file,
            separator="\t",
            has_header=False,
            new_columns=[
                "query",
                "target",
                "qstart",
                "qend",
                "tstart",
                "tend",
                "tseq",
                "prob",
                "alntmscore",
                "u",
                "t",
                "lddtfull",
            ],
            schema_overrides={
                "query": pl.Utf8,
                "target": pl.Utf8,
            },
        )
        if len(results) == 0:
            return None

        # Get best match (or fixed family if specified)
        if fixed_alignment_id:
            match = results.filter(pl.col("target") == fixed_alignment_id)
        else:
            match = results.sort("prob", descending=True)
            if match[0, "prob"] < min_probability:
                return None

        if isinstance(tmp_dir, tempfile.TemporaryDirectory):
            tmp_dir.cleanup()

        target_to_query_matrix = _parse_foldseek_vector(match[0, "u"]).reshape(3, 3)
        target_to_query_translation = _parse_foldseek_vector(match[0, "t"])

        # Manually calculate the inverse of the alignment transformation
        # Inverse rotation is the transpose of the rotation matrix
        R_align_inv = target_to_query_matrix.T
        # Inverse translation is -R_inv @ t
        t_align_inv = -R_align_inv @ target_to_query_translation
        query_to_target_transform = TransformationMatrix(
            rotation=R_align_inv, translation=t_align_inv
        )

        return AlignmentResult(
            db_id=match[0, "target"],
            probability=float(match[0, "prob"]),
            aligned_region=np.array((int(match[0, "qstart"]), int(match[0, "qend"]))),
            alignment_scores=_parse_foldseek_vector(match[0, "lddtfull"]),
            rotation_matrix=query_to_target_transform,
        )

    def align_structures_batch(
        self,
        structure_paths: List[Path],
        min_probability: float = 0.5,
        fixed_alignment_id: Optional[str] = None,
        tmp_dir: Optional[Path] = None,
    ) -> Dict[Path, Optional[AlignmentResult]]:
        """Batch align multiple structures to family database for better performance.

        Args:
            structure_paths: List of structure file paths to align
            min_probability: Minimum alignment probability threshold
            fixed_alignment_id: Optional fixed family ID for all alignments
            tmp_dir: Optional temporary directory path

        Returns:
            Dictionary mapping structure paths to their alignment results
        """
        if tmp_dir is None:
            tmp_dir = tempfile.TemporaryDirectory(
                ignore_cleanup_errors=True, prefix="foldseek_batch_"
            )
            tmp_dir_path = Path(tmp_dir.name)
        else:
            tmp_dir_path = tmp_dir

        # Create input directory with all structures
        structures_dir = tmp_dir_path / "batch_structures"
        structures_dir.mkdir(parents=True, exist_ok=True)

        # Copy all structures to batch directory
        structure_mapping = {}
        for i, structure_path in enumerate(structure_paths):
            # Use index to avoid naming conflicts
            batch_filename = (
                f"structure_{i}_{structure_path.stem}{structure_path.suffix}"
            )
            batch_file = structures_dir / batch_filename
            batch_file.write_bytes(structure_path.read_bytes())
            structure_mapping[batch_filename] = structure_path

        # Run batch foldseek search
        result_file = tmp_dir_path / "batch_foldseek_result.tsv"
        foldseek_tmp_dir = tmp_dir_path / "foldseek_tmp"

        self._run_foldseek_batch_search(structures_dir, result_file, foldseek_tmp_dir)

        if not result_file.exists():
            raise RuntimeError("FoldSeek batch result file not found")

        # Parse batch results
        try:
            results = pl.read_csv(
                result_file,
                separator="\t",
                has_header=False,
                new_columns=[
                    "query",
                    "target",
                    "qstart",
                    "qend",
                    "tstart",
                    "tend",
                    "tseq",
                    "prob",
                    "alntmscore",
                    "u",
                    "t",
                    "lddtfull",
                ],
                schema_overrides={
                    "query": pl.Utf8,
                    "target": pl.Utf8,
                },
            )
        except Exception:
            # Return empty results if parsing fails
            return {path: None for path in structure_paths}

        # Group results by query and process each structure
        alignment_results = {}
        for structure_path in structure_paths:
            # Find the batch filename for this structure
            batch_filename = None
            for batch_name, orig_path in structure_mapping.items():
                if orig_path == structure_path:
                    batch_filename = batch_name
                    break

            if batch_filename is None:
                alignment_results[structure_path] = None
                continue

            # Filter results for this specific query
            query_results = results.filter(
                pl.col("query").str.contains(Path(batch_filename).stem)
            )

            if len(query_results) == 0:
                alignment_results[structure_path] = None
                continue

            # Get best match (or fixed family if specified)
            if fixed_alignment_id:
                match = query_results.filter(pl.col("target") == fixed_alignment_id)
                if len(match) == 0:
                    alignment_results[structure_path] = None
                    continue
            else:
                match = query_results.sort("prob", descending=True)
                if match[0, "prob"] < min_probability:
                    alignment_results[structure_path] = None
                    continue

            # Create alignment result
            target_to_query_matrix = _parse_foldseek_vector(match[0, "u"]).reshape(3, 3)
            target_to_query_translation = _parse_foldseek_vector(match[0, "t"])

            # Calculate inverse transformation
            R_align_inv = target_to_query_matrix.T
            t_align_inv = -R_align_inv @ target_to_query_translation
            query_to_target_transform = TransformationMatrix(
                rotation=R_align_inv, translation=t_align_inv
            )

            alignment_results[structure_path] = AlignmentResult(
                db_id=match[0, "target"],
                probability=float(match[0, "prob"]),
                aligned_region=np.array(
                    (int(match[0, "qstart"]), int(match[0, "qend"]))
                ),
                alignment_scores=_parse_foldseek_vector(match[0, "lddtfull"]),
                rotation_matrix=query_to_target_transform,
            )

        if isinstance(tmp_dir, tempfile.TemporaryDirectory):
            tmp_dir.cleanup()

        return alignment_results

    def _run_foldseek_batch_search(
        self, structures_dir: Path, output_file: Path, tmp_dir: Path
    ) -> None:
        """Runs FoldSeek batch search against family database."""
        cmd = [
            self.foldseek_executable,
            "easy-search",
            str(structures_dir),
            str(self.database_path),
            str(output_file),
            str(tmp_dir),
            "--format-output",
            "query,target,qstart,qend,tstart,tend,tseq,prob,alntmscore,u,t,lddtfull",
        ]

        subprocess.run(cmd, check=True, capture_output=True, text=True)

    def _run_foldseek_search(
        self, structure_path: Path, output_file: Path, tmp_dir: Path
    ) -> None:
        """Runs FoldSeek search against family database."""
        cmd = [
            self.foldseek_executable,
            "easy-search",
            str(structure_path),
            str(self.database_path),
            str(output_file),
            str(tmp_dir),
            "--format-output",
            "query,target,qstart,qend,tstart,tend,tseq,prob,alntmscore,u,t,lddtfull",
        ]

        subprocess.run(cmd, check=True, capture_output=True, text=True)

align_structure(structure_path, min_probability=0.5, fixed_alignment_id=None, tmp_dir=None)

Aligns structure to family database and returns best match.

Source code in src/flatprot/alignment/foldseek.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def align_structure(
    self,
    structure_path: Path,
    min_probability: float = 0.5,
    fixed_alignment_id: Optional[str] = None,
    tmp_dir: Optional[Path] = None,
) -> Optional[AlignmentResult]:
    """Aligns structure to family database and returns best match."""

    if tmp_dir is None:
        tmp_dir = tempfile.TemporaryDirectory(
            ignore_cleanup_errors=True, prefix="foldseek_"
        )
        tmp_dir_path = Path(tmp_dir.name)
    else:
        tmp_dir_path = tmp_dir

    # Run FoldSeek search
    result_file = tmp_dir_path / "foldseek_result.tsv"
    foldseek_tmp_dir = tmp_dir_path / "foldseek_tmp"

    self._run_foldseek_search(structure_path, result_file, foldseek_tmp_dir)

    if not result_file.exists():
        raise RuntimeError("FoldSeek result file not found")

    # Parse results
    results = pl.read_csv(
        result_file,
        separator="\t",
        has_header=False,
        new_columns=[
            "query",
            "target",
            "qstart",
            "qend",
            "tstart",
            "tend",
            "tseq",
            "prob",
            "alntmscore",
            "u",
            "t",
            "lddtfull",
        ],
        schema_overrides={
            "query": pl.Utf8,
            "target": pl.Utf8,
        },
    )
    if len(results) == 0:
        return None

    # Get best match (or fixed family if specified)
    if fixed_alignment_id:
        match = results.filter(pl.col("target") == fixed_alignment_id)
    else:
        match = results.sort("prob", descending=True)
        if match[0, "prob"] < min_probability:
            return None

    if isinstance(tmp_dir, tempfile.TemporaryDirectory):
        tmp_dir.cleanup()

    target_to_query_matrix = _parse_foldseek_vector(match[0, "u"]).reshape(3, 3)
    target_to_query_translation = _parse_foldseek_vector(match[0, "t"])

    # Manually calculate the inverse of the alignment transformation
    # Inverse rotation is the transpose of the rotation matrix
    R_align_inv = target_to_query_matrix.T
    # Inverse translation is -R_inv @ t
    t_align_inv = -R_align_inv @ target_to_query_translation
    query_to_target_transform = TransformationMatrix(
        rotation=R_align_inv, translation=t_align_inv
    )

    return AlignmentResult(
        db_id=match[0, "target"],
        probability=float(match[0, "prob"]),
        aligned_region=np.array((int(match[0, "qstart"]), int(match[0, "qend"]))),
        alignment_scores=_parse_foldseek_vector(match[0, "lddtfull"]),
        rotation_matrix=query_to_target_transform,
    )

align_structures_batch(structure_paths, min_probability=0.5, fixed_alignment_id=None, tmp_dir=None)

Batch align multiple structures to family database for better performance.

Parameters:
  • structure_paths (List[Path]) –

    List of structure file paths to align

  • min_probability (float, default: 0.5 ) –

    Minimum alignment probability threshold

  • fixed_alignment_id (Optional[str], default: None ) –

    Optional fixed family ID for all alignments

  • tmp_dir (Optional[Path], default: None ) –

    Optional temporary directory path

Returns:
  • Dict[Path, Optional[AlignmentResult]]

    Dictionary mapping structure paths to their alignment results

Source code in src/flatprot/alignment/foldseek.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def align_structures_batch(
    self,
    structure_paths: List[Path],
    min_probability: float = 0.5,
    fixed_alignment_id: Optional[str] = None,
    tmp_dir: Optional[Path] = None,
) -> Dict[Path, Optional[AlignmentResult]]:
    """Batch align multiple structures to family database for better performance.

    Args:
        structure_paths: List of structure file paths to align
        min_probability: Minimum alignment probability threshold
        fixed_alignment_id: Optional fixed family ID for all alignments
        tmp_dir: Optional temporary directory path

    Returns:
        Dictionary mapping structure paths to their alignment results
    """
    if tmp_dir is None:
        tmp_dir = tempfile.TemporaryDirectory(
            ignore_cleanup_errors=True, prefix="foldseek_batch_"
        )
        tmp_dir_path = Path(tmp_dir.name)
    else:
        tmp_dir_path = tmp_dir

    # Create input directory with all structures
    structures_dir = tmp_dir_path / "batch_structures"
    structures_dir.mkdir(parents=True, exist_ok=True)

    # Copy all structures to batch directory
    structure_mapping = {}
    for i, structure_path in enumerate(structure_paths):
        # Use index to avoid naming conflicts
        batch_filename = (
            f"structure_{i}_{structure_path.stem}{structure_path.suffix}"
        )
        batch_file = structures_dir / batch_filename
        batch_file.write_bytes(structure_path.read_bytes())
        structure_mapping[batch_filename] = structure_path

    # Run batch foldseek search
    result_file = tmp_dir_path / "batch_foldseek_result.tsv"
    foldseek_tmp_dir = tmp_dir_path / "foldseek_tmp"

    self._run_foldseek_batch_search(structures_dir, result_file, foldseek_tmp_dir)

    if not result_file.exists():
        raise RuntimeError("FoldSeek batch result file not found")

    # Parse batch results
    try:
        results = pl.read_csv(
            result_file,
            separator="\t",
            has_header=False,
            new_columns=[
                "query",
                "target",
                "qstart",
                "qend",
                "tstart",
                "tend",
                "tseq",
                "prob",
                "alntmscore",
                "u",
                "t",
                "lddtfull",
            ],
            schema_overrides={
                "query": pl.Utf8,
                "target": pl.Utf8,
            },
        )
    except Exception:
        # Return empty results if parsing fails
        return {path: None for path in structure_paths}

    # Group results by query and process each structure
    alignment_results = {}
    for structure_path in structure_paths:
        # Find the batch filename for this structure
        batch_filename = None
        for batch_name, orig_path in structure_mapping.items():
            if orig_path == structure_path:
                batch_filename = batch_name
                break

        if batch_filename is None:
            alignment_results[structure_path] = None
            continue

        # Filter results for this specific query
        query_results = results.filter(
            pl.col("query").str.contains(Path(batch_filename).stem)
        )

        if len(query_results) == 0:
            alignment_results[structure_path] = None
            continue

        # Get best match (or fixed family if specified)
        if fixed_alignment_id:
            match = query_results.filter(pl.col("target") == fixed_alignment_id)
            if len(match) == 0:
                alignment_results[structure_path] = None
                continue
        else:
            match = query_results.sort("prob", descending=True)
            if match[0, "prob"] < min_probability:
                alignment_results[structure_path] = None
                continue

        # Create alignment result
        target_to_query_matrix = _parse_foldseek_vector(match[0, "u"]).reshape(3, 3)
        target_to_query_translation = _parse_foldseek_vector(match[0, "t"])

        # Calculate inverse transformation
        R_align_inv = target_to_query_matrix.T
        t_align_inv = -R_align_inv @ target_to_query_translation
        query_to_target_transform = TransformationMatrix(
            rotation=R_align_inv, translation=t_align_inv
        )

        alignment_results[structure_path] = AlignmentResult(
            db_id=match[0, "target"],
            probability=float(match[0, "prob"]),
            aligned_region=np.array(
                (int(match[0, "qstart"]), int(match[0, "qend"]))
            ),
            alignment_scores=_parse_foldseek_vector(match[0, "lddtfull"]),
            rotation_matrix=query_to_target_transform,
        )

    if isinstance(tmp_dir, tempfile.TemporaryDirectory):
        tmp_dir.cleanup()

    return alignment_results

options: show_root_heading: true show_root_toc_entry: false

Alignment Database

Class for interacting with the HDF5 alignment database containing pre-calculated matrices and the associated data entry structure.

AlignmentDBEntry dataclass

Stores alignment data with its rotation matrix.

Source code in src/flatprot/alignment/db.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
@dataclass
class AlignmentDBEntry:
    """Stores alignment data with its rotation matrix."""

    rotation_matrix: TransformationMatrix
    entry_id: str
    structure_name: str
    metadata: Optional[Dict[str, float | str]] = None

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, AlignmentDBEntry):
            return False
        return (
            np.allclose(
                self.rotation_matrix.to_array(), other.rotation_matrix.to_array()
            )
            and self.entry_id == other.entry_id
            and self.structure_name == other.structure_name
        )

AlignmentDatabase

Handles alignment database using HDF5 storage with memory-mapped arrays.

Source code in src/flatprot/alignment/db.py
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
class AlignmentDatabase:
    """Handles alignment database using HDF5 storage with memory-mapped arrays."""

    def __init__(self, path: Path):
        self.path = path
        self._file: Optional[h5py.File] = None
        self._structure_name_index: dict[str, str] = {}  # structure_name -> entry_id

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def open(self) -> None:
        """Opens the database file in read/write mode."""
        self._file = h5py.File(self.path, "a")
        self._load_index()

    def close(self) -> None:
        """Closes the database file."""
        if self._file is not None:
            self._file.close()
            self._file = None

    def _load_index(self) -> None:
        """Loads structure name index from HDF5 file."""
        if "index" not in self._file:
            return

        names = self._file["index/structure_names"][:]
        ids = self._file["index/entry_ids"][:]
        self._structure_name_index = dict(zip(names, ids))

    def _save_index(self) -> None:
        """Saves structure name index to HDF5 file."""
        if "index" in self._file:
            del self._file["index"]

        index = self._file.create_group("index")
        names = list(self._structure_name_index.keys())
        ids = list(self._structure_name_index.values())
        index.create_dataset("structure_names", data=names)
        index.create_dataset("entry_ids", data=ids)

    def contains_entry_id(self, entry_id: str) -> bool:
        """Checks if an entry_id exists in the database. O(1) lookup."""
        if self._file is None:
            raise RuntimeError("Database not opened")
        return entry_id in self._file

    def contains_structure_name(self, structure_name: str) -> bool:
        """Checks if a structure_name exists in the database. O(1) lookup."""
        return structure_name in self._structure_name_index

    def get_by_entry_id(
        self, entry_id: str, default: Optional[AlignmentDBEntry] = None
    ) -> Optional[AlignmentDBEntry]:
        """Returns alignment entry for given entry_id or default if not found. O(1) lookup."""
        if self._file is None:
            raise RuntimeError("Database not opened")
        if entry_id not in self._file:
            return default

        entry_group = self._file[entry_id]
        metadata = {k: v for k, v in entry_group.attrs.items() if k != "structure_name"}
        return AlignmentDBEntry(
            rotation_matrix=TransformationMatrix.from_array(entry_group["rotation"][:]),
            entry_id=entry_id,
            structure_name=entry_group.attrs["structure_name"],
            metadata=metadata if metadata else None,
        )

    def get_by_structure_name(
        self, structure_name: str, default: Optional[AlignmentDBEntry] = None
    ) -> Optional[AlignmentDBEntry]:
        """Returns alignment entry for given structure_name or default if not found. O(1) lookup."""
        if structure_name not in self._structure_name_index:
            return default
        entry_id = self._structure_name_index[structure_name]
        return self.get_by_entry_id(entry_id)

    def add_entry(self, entry: AlignmentDBEntry) -> None:
        """Adds a new entry to the database."""
        if self._file is None:
            raise RuntimeError("Database not opened")

        if entry.entry_id in self._file:
            raise ValueError(f"Entry ID {entry.entry_id} already exists")

        if entry.structure_name in self._structure_name_index:
            raise ValueError(f"Structure name {entry.structure_name} already exists")

        # Create entry group and save data
        entry_group = self._file.create_group(entry.entry_id)
        entry_group.create_dataset("rotation", data=entry.rotation_matrix.to_array())
        entry_group.attrs["structure_name"] = entry.structure_name

        # Store metadata if available
        if entry.metadata:
            for key, value in entry.metadata.items():
                entry_group.attrs[key] = value

        # Update index
        self._structure_name_index[entry.structure_name] = entry.entry_id
        self._save_index()

    def update(self, entry: AlignmentDBEntry) -> None:
        """Updates an existing entry in the database."""
        if self._file is None:
            raise RuntimeError("Database not opened")

        if entry.entry_id not in self._file:
            raise KeyError(f"Entry ID {entry.entry_id} not found in database")

        old_entry = self.get_by_entry_id(entry.entry_id)

        # Check structure name conflicts
        if (
            entry.structure_name != old_entry.structure_name
            and entry.structure_name in self._structure_name_index
        ):
            raise ValueError(f"Structure name {entry.structure_name} already exists")

        # Update index if structure name changed
        if entry.structure_name != old_entry.structure_name:
            del self._structure_name_index[old_entry.structure_name]
            self._structure_name_index[entry.structure_name] = entry.entry_id

        # Update entry data
        del self._file[entry.entry_id]
        entry_group = self._file.create_group(entry.entry_id)
        entry_group.create_dataset("rotation", data=entry.rotation_matrix.to_array())
        entry_group.attrs["structure_name"] = entry.structure_name

        # Store metadata if available
        if entry.metadata:
            for key, value in entry.metadata.items():
                entry_group.attrs[key] = value

        # Save index
        self._save_index()

add_entry(entry)

Adds a new entry to the database.

Source code in src/flatprot/alignment/db.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def add_entry(self, entry: AlignmentDBEntry) -> None:
    """Adds a new entry to the database."""
    if self._file is None:
        raise RuntimeError("Database not opened")

    if entry.entry_id in self._file:
        raise ValueError(f"Entry ID {entry.entry_id} already exists")

    if entry.structure_name in self._structure_name_index:
        raise ValueError(f"Structure name {entry.structure_name} already exists")

    # Create entry group and save data
    entry_group = self._file.create_group(entry.entry_id)
    entry_group.create_dataset("rotation", data=entry.rotation_matrix.to_array())
    entry_group.attrs["structure_name"] = entry.structure_name

    # Store metadata if available
    if entry.metadata:
        for key, value in entry.metadata.items():
            entry_group.attrs[key] = value

    # Update index
    self._structure_name_index[entry.structure_name] = entry.entry_id
    self._save_index()

close()

Closes the database file.

Source code in src/flatprot/alignment/db.py
55
56
57
58
59
def close(self) -> None:
    """Closes the database file."""
    if self._file is not None:
        self._file.close()
        self._file = None

contains_entry_id(entry_id)

Checks if an entry_id exists in the database. O(1) lookup.

Source code in src/flatprot/alignment/db.py
81
82
83
84
85
def contains_entry_id(self, entry_id: str) -> bool:
    """Checks if an entry_id exists in the database. O(1) lookup."""
    if self._file is None:
        raise RuntimeError("Database not opened")
    return entry_id in self._file

contains_structure_name(structure_name)

Checks if a structure_name exists in the database. O(1) lookup.

Source code in src/flatprot/alignment/db.py
87
88
89
def contains_structure_name(self, structure_name: str) -> bool:
    """Checks if a structure_name exists in the database. O(1) lookup."""
    return structure_name in self._structure_name_index

get_by_entry_id(entry_id, default=None)

Returns alignment entry for given entry_id or default if not found. O(1) lookup.

Source code in src/flatprot/alignment/db.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def get_by_entry_id(
    self, entry_id: str, default: Optional[AlignmentDBEntry] = None
) -> Optional[AlignmentDBEntry]:
    """Returns alignment entry for given entry_id or default if not found. O(1) lookup."""
    if self._file is None:
        raise RuntimeError("Database not opened")
    if entry_id not in self._file:
        return default

    entry_group = self._file[entry_id]
    metadata = {k: v for k, v in entry_group.attrs.items() if k != "structure_name"}
    return AlignmentDBEntry(
        rotation_matrix=TransformationMatrix.from_array(entry_group["rotation"][:]),
        entry_id=entry_id,
        structure_name=entry_group.attrs["structure_name"],
        metadata=metadata if metadata else None,
    )

get_by_structure_name(structure_name, default=None)

Returns alignment entry for given structure_name or default if not found. O(1) lookup.

Source code in src/flatprot/alignment/db.py
109
110
111
112
113
114
115
116
def get_by_structure_name(
    self, structure_name: str, default: Optional[AlignmentDBEntry] = None
) -> Optional[AlignmentDBEntry]:
    """Returns alignment entry for given structure_name or default if not found. O(1) lookup."""
    if structure_name not in self._structure_name_index:
        return default
    entry_id = self._structure_name_index[structure_name]
    return self.get_by_entry_id(entry_id)

open()

Opens the database file in read/write mode.

Source code in src/flatprot/alignment/db.py
50
51
52
53
def open(self) -> None:
    """Opens the database file in read/write mode."""
    self._file = h5py.File(self.path, "a")
    self._load_index()

update(entry)

Updates an existing entry in the database.

Source code in src/flatprot/alignment/db.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def update(self, entry: AlignmentDBEntry) -> None:
    """Updates an existing entry in the database."""
    if self._file is None:
        raise RuntimeError("Database not opened")

    if entry.entry_id not in self._file:
        raise KeyError(f"Entry ID {entry.entry_id} not found in database")

    old_entry = self.get_by_entry_id(entry.entry_id)

    # Check structure name conflicts
    if (
        entry.structure_name != old_entry.structure_name
        and entry.structure_name in self._structure_name_index
    ):
        raise ValueError(f"Structure name {entry.structure_name} already exists")

    # Update index if structure name changed
    if entry.structure_name != old_entry.structure_name:
        del self._structure_name_index[old_entry.structure_name]
        self._structure_name_index[entry.structure_name] = entry.entry_id

    # Update entry data
    del self._file[entry.entry_id]
    entry_group = self._file.create_group(entry.entry_id)
    entry_group.create_dataset("rotation", data=entry.rotation_matrix.to_array())
    entry_group.attrs["structure_name"] = entry.structure_name

    # Store metadata if available
    if entry.metadata:
        for key, value in entry.metadata.items():
            entry_group.attrs[key] = value

    # Save index
    self._save_index()

options: show_root_heading: true show_root_toc_entry: false

Alignment Utilities

Utility functions used within the alignment module.

align_structure_database(structure_file, foldseek_db_path, foldseek_command='foldseek', min_probability=0.5, target_db_id=None)

Calculate the alignment result for structural alignment using FoldSeek.

Parameters:
  • structure_file (Path) –

    Path to input protein structure file (PDB/mmCIF).

  • foldseek_db_path (Path) –

    Path to FoldSeek-specific database.

  • foldseek_command (str, default: 'foldseek' ) –

    FoldSeek executable name/path.

  • min_probability (float, default: 0.5 ) –

    Minimum alignment probability threshold.

  • target_db_id (Optional[str], default: None ) –

    If provided, force alignment to this specific FoldSeek target ID.

Returns:
  • AlignmentResult( AlignmentResult ) –

    Result containing alignment details and rotation matrix.

Raises:
Source code in src/flatprot/alignment/utils.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def align_structure_database(
    structure_file: Path,
    foldseek_db_path: Path,
    foldseek_command: str = "foldseek",
    min_probability: float = 0.5,
    target_db_id: Optional[str] = None,
) -> AlignmentResult:
    """Calculate the alignment result for structural alignment using FoldSeek.

    Args:
        structure_file: Path to input protein structure file (PDB/mmCIF).
        foldseek_db_path: Path to FoldSeek-specific database.
        foldseek_command: FoldSeek executable name/path.
        min_probability: Minimum alignment probability threshold.
        target_db_id: If provided, force alignment to this specific FoldSeek target ID.

    Returns:
        AlignmentResult: Result containing alignment details and rotation matrix.

    Raises:
        AlignmentError: General alignment failures
        NoSignificantAlignmentError: No alignment meets probability threshold
    """
    aligner = FoldseekAligner(
        foldseek_executable=foldseek_command, database_path=foldseek_db_path
    )

    alignment_result = aligner.align_structure(
        structure_path=structure_file,
        min_probability=min_probability,
        fixed_alignment_id=target_db_id,
    )

    if alignment_result is None:
        raise NoSignificantAlignmentError(
            f"No alignment found above {min_probability} probability threshold"
        )

    return alignment_result

get_aligned_rotation_database(alignment, db, id_transform=_foldseek_id_to_db_id)

Combines alignment rotation with database rotation.

Parameters:
  • alignment (AlignmentResult) –

    Alignment result from align_structure_database

  • db (AlignmentDatabase) –

    Initialized AlignmentDatabase instance

  • id_transform (Callable[[str], str], default: _foldseek_id_to_db_id ) –

    Function to transform FoldSeek IDs to database IDs

Returns:
Raises:
Source code in src/flatprot/alignment/utils.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def get_aligned_rotation_database(
    alignment: AlignmentResult,
    db: AlignmentDatabase,
    id_transform: Callable[[str], str] = _foldseek_id_to_db_id,
) -> tuple[TransformationMatrix, AlignmentDBEntry]:
    """Combines alignment rotation with database rotation.

    Args:
        alignment: Alignment result from align_structure_database
        db: Initialized AlignmentDatabase instance
        id_transform: Function to transform FoldSeek IDs to database IDs

    Returns:
        tuple: A tuple containing:
            - Combined transformation matrix
            - Database entry object for the matched alignment

    Raises:
        DatabaseEntryNotFoundError: If matched database entry is missing
    """
    with db:
        db_id = id_transform(alignment.db_id)
        if not db.contains_entry_id(db_id):
            raise DatabaseEntryNotFoundError(
                f"Database entry {alignment.db_id} not found"
            )

        db_entry = db.get_by_entry_id(db_id)

    alignment_transform: TransformationMatrix = alignment.rotation_matrix
    db_transform: TransformationMatrix = db_entry.rotation_matrix

    # Combine the transformations: apply query_to_target_transform first, then db_transform.
    # T_final = T_db ∘ T_inv_align
    # Since T2.before(T1) applies T1 then T2, we use:
    final_transform = db_transform.before(alignment_transform)

    return final_transform, db_entry

options: show_root_heading: true show_root_toc_entry: false

Alignment Errors

Exceptions specific to the alignment process.

AlignmentError

Bases: FlatProtError

Base class for alignment-related errors.

Source code in src/flatprot/alignment/errors.py
 8
 9
10
11
class AlignmentError(FlatProtError):
    """Base class for alignment-related errors."""

    pass

DatabaseEntryNotFoundError

Bases: AlignmentError

Raised when a database entry is not found.

Source code in src/flatprot/alignment/errors.py
20
21
22
23
class DatabaseEntryNotFoundError(AlignmentError):
    """Raised when a database entry is not found."""

    pass

NoSignificantAlignmentError

Bases: AlignmentError

Raised when no significant alignment is found.

Source code in src/flatprot/alignment/errors.py
14
15
16
17
class NoSignificantAlignmentError(AlignmentError):
    """Raised when no significant alignment is found."""

    pass

options: show_root_heading: true show_root_toc_entry: false