Skip to content

Remove the square frame from the face #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
5 changes: 4 additions & 1 deletion lipsync/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from lipsync.lipsync import LipSync
"""
The lipsync package initialization.
"""

from lipsync.lipsync import LipSync

__all__ = [
'LipSync',
Expand Down
52 changes: 38 additions & 14 deletions lipsync/audio.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,40 @@
"""
Audio processing utilities for lipsync.

Includes loading WAV files, applying preemphasis,
and computing mel-spectrograms with optional normalization.
"""

import librosa
import librosa.filters
import numpy as np
from scipy import signal

from lipsync.hparams import HParams

hp = HParams()


def load_wav(path: str, sr: int) -> np.ndarray:
"""Load a WAV file.
"""
Load a WAV file using librosa.

Args:
path (str): Path to the WAV file.
sr (int): Sampling rate to load the audio at.

Returns:
np.ndarray: Audio time series as a 1D numpy array.
np.ndarray: Audio time series as a 1D NumPy array.
"""
return librosa.core.load(path, sr=sr)[0]


def preemphasis_func(wav: np.ndarray, k: float, preemphasize: bool = True) -> np.ndarray:
"""Apply a preemphasis filter to the waveform.
"""
Apply a preemphasis filter to the waveform.

Args:
wav (np.ndarray): Input waveform as a 1D numpy array.
wav (np.ndarray): Input waveform as a 1D NumPy array.
k (float): Preemphasis coefficient.
preemphasize (bool): Whether to apply preemphasis or not.

Expand All @@ -35,17 +45,19 @@ def preemphasis_func(wav: np.ndarray, k: float, preemphasize: bool = True) -> np
# This increases the magnitude of high-frequency components.
if preemphasize:
return signal.lfilter([1, -k], [1], wav)

return wav


def melspectrogram(wav: np.ndarray) -> np.ndarray:
"""Compute the mel-spectrogram of a waveform.
"""
Compute the mel-spectrogram of a waveform.

Args:
wav (np.ndarray): Input waveform array.

Returns:
np.ndarray: Mel-spectrogram as a 2D numpy array (num_mels x time).
np.ndarray: Mel-spectrogram as a 2D NumPy array (num_mels x time).
"""
D = _stft(preemphasis_func(wav, hp.preemphasis, hp.preemphasize))
S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db
Expand All @@ -56,19 +68,26 @@ def melspectrogram(wav: np.ndarray) -> np.ndarray:


def _stft(y: np.ndarray) -> np.ndarray:
"""Compute the STFT of the given waveform.
"""
Compute the STFT of the given waveform.

Args:
y (np.ndarray): Input waveform.

Returns:
np.ndarray: Complex STFT of y. Shape is (1 + n_fft/2, time).
np.ndarray: Complex STFT of shape (1 + n_fft//2, time).
"""
return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_size, win_length=hp.win_size)
return librosa.stft(
y=y,
n_fft=hp.n_fft,
hop_length=hp.hop_size,
win_length=hp.win_size
)


def _linear_to_mel(spectrogram: np.ndarray) -> np.ndarray:
"""Convert a linear-scale spectrogram to mel-scale.
"""
Convert a linear-scale spectrogram to mel-scale.

Args:
spectrogram (np.ndarray): Linear frequency spectrogram.
Expand All @@ -81,7 +100,8 @@ def _linear_to_mel(spectrogram: np.ndarray) -> np.ndarray:


def _build_mel_basis() -> np.ndarray:
"""Construct a mel-filter bank.
"""
Construct a mel filter bank.

Returns:
np.ndarray: Mel filter bank matrix.
Expand All @@ -97,7 +117,8 @@ def _build_mel_basis() -> np.ndarray:


def _amp_to_db(x: np.ndarray) -> np.ndarray:
"""Convert amplitude to decibels.
"""
Convert amplitude to decibels.

Args:
x (np.ndarray): Amplitude values.
Expand All @@ -110,7 +131,8 @@ def _amp_to_db(x: np.ndarray) -> np.ndarray:


def _normalize(spec: np.ndarray) -> np.ndarray:
"""Normalize the mel-spectrogram.
"""
Normalize the mel-spectrogram.

Args:
spec (np.ndarray): Decibel-scaled mel-spectrogram.
Expand Down Expand Up @@ -139,7 +161,9 @@ def _normalize(spec: np.ndarray) -> np.ndarray:

if hp.symmetric_mels:
# Symmetric range normalization
return (2 * hp.max_abs_value) * ((spec - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value
return (2 * hp.max_abs_value) * (
(spec - hp.min_level_db) / (-hp.min_level_db)
) - hp.max_abs_value
else:
# Asymmetric range normalization
return hp.max_abs_value * ((spec - hp.min_level_db) / (-hp.min_level_db))
14 changes: 10 additions & 4 deletions lipsync/helpers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
"""
Helper utilities for lipsync, including video frame reading and face bounding box extraction.
"""

import av
import numpy as np
from typing import Tuple, List
Expand Down Expand Up @@ -45,28 +49,30 @@ def read_frames(face: str) -> Tuple[List[np.ndarray], int]:
raise ValueError(f"An error occurred while reading the video file: {e}") from e


def get_face_box(landmarks: list) -> Tuple[int, int, int, int]:
def get_face_box(landmarks: list, face_index: int = 0) -> Tuple[int, int, int, int]:
"""
Extracts and returns the bounding box coordinates of a detected face.

Args:
landmarks (list): A list containing facial landmarks where the third element
(index 2) represents the bounding box coordinates.
(index 2) represents the bounding box coordinates.
face_index (int, optional): Index of the detected face whose bounding box is returned. Defaults to 0.

Returns:
Tuple[int, int, int, int]: The bounding box coordinates (x1, y1, x2, y2) of the face.

Raises:
ValueError: If the landmarks list is improperly structured or does not contain
the expected bounding box.
the expected bounding box.
"""
try:
# Extract the face box from the landmarks
face_box = landmarks[2][0] # Access the bounding box coordinates
face_box = landmarks[2][face_index] # Access the bounding box coordinates
face_box = np.clip(face_box, 0, None) # Ensure no negative values

# Convert bounding box values to integers
x1, y1, x2, y2 = map(int, face_box[:-1]) # Exclude the confidence score (last value)
return x1, y1, x2, y2

except (IndexError, TypeError, ValueError) as e:
raise ValueError("Invalid landmarks structure. Could not extract face box.") from e
Loading