Browse Source

Finished reading all of the database's higher-level parameters. Next step: work out how to read the actual audio content.

master
hadware 6 years ago
parent
commit
d790725000
  1. 61
      db_reader.py
  2. 1
      requirements.txt

61
db_reader.py

@ -1,8 +1,10 @@
import argparse
from dataclasses import dataclass, field
from io import SEEK_SET
from pathlib import Path
from struct import unpack_from, unpack
from typing import List, Tuple, BinaryIO
from typing import List, Tuple, BinaryIO, Dict
import numpy as np
MBROLA_VOICES_FOLDER = Path("/usr/share/mbrola/")
argparser = argparse.ArgumentParser()
@ -10,6 +12,7 @@ argparser.add_argument("mbrola_db", type=Path,
help="Mrbola db name or direct path")
PhonemeCode = int
Diphone = Tuple[str,str]
def read_str(io: BinaryIO):
@ -26,14 +29,22 @@ def read_str(io: BinaryIO):
@dataclass
class DiphoneInfo:
    """Index entry describing a single diphone of the database.

    The phoneme names are stored as plain strings. ``left_code`` and
    ``right_code`` are placeholders that are not implemented yet and
    currently evaluate to ``None``.
    """

    left: str  # name of the left phoneme
    right: str  # name of the right phoneme
    pos_wave: int  # position in SPEECH_FILE
    halfseg: int  # position of center of diphone
    pos_pm: int  # index in PITCHMARK_FILE
    nb_frame: int  # Number of pitch markers

    @property
    def left_code(self):
        """Code of the left phoneme (not implemented yet, yields ``None``)."""
        # TODO: map `self.left` to its phoneme code.
        return None

    @property
    def right_code(self):
        """Code of the right phoneme (not implemented yet, yields ``None``)."""
        # TODO: map `self.right` to its phoneme code.
        return None
@dataclass
class FrameType:
@ -56,10 +67,11 @@ class MbrolaDatabase:
max_samples: int = 0 # Size of the diphone buffer= 0 means let me manage it myself
magic_header: str = "MBROLA" # Magic header of the database
version: str = "2.06" # version of the database
info: List[str] = ""
info: List[str] = None
silence_phone: str = "_" # silence symbol in the database
pitch_marks: List[FrameType] = field(default=list)
diphone_table: Dict[Diphone, DiphoneInfo] = field(default_factory=dict)
pitch_marks: np.ndarray = None
def read_header(self, db_file: BinaryIO):
"""Reads the database header"""
@ -78,16 +90,12 @@ class MbrolaDatabase:
def read_index(self, db_file: BinaryIO):
"""Reads the index table of diphones"""""
pass
def read_pitchmarks(self, db_file: BinaryIO):
i = 0
pm_index = 0 #  cumulative position in pitch mark vector
wav_index = 0 #   cumulative position in the waveform database
while (pm_index != self.size_mark and i < self.nb_diphone):
left = read_str(db_file)
right = read_str(db_file)
while pm_index != self.size_mark and i < self.nb_diphone:
left_phone = read_str(db_file)
right_phone = read_str(db_file)
half_segment = unpack_from("<h", db_file.read(2))[0]
nb_frames = unpack_from("<B", db_file.read(1))[0]
nb_wframe = unpack_from("<B", db_file.read(1))[0]
@ -96,12 +104,35 @@ class MbrolaDatabase:
pm_index += nb_frames
if pm_index == self.size_mark:
self.silence_phone = left
self.silence_phone = left_phone
new_wav_index = wav_index
wav_index = nb_wframe * self.mbr_period
new_diph = DiphoneInfo(left=left_phone, right=right_phone,
pos_wave=new_wav_index,
halfseg=half_segment,
pos_pm=position_pm,
nb_frame=nb_frames)
self.diphone_table[(left_phone, right_phone)] = new_diph
# keep track of the phoneme with the biggest number of frames
if self.max_frame < nb_wframe:
self.max_frame = nb_wframe
i += 1
def read_pitchmarks(self, db_file: BinaryIO):
    """Read the pitch-mark table and remember where the raw data begins.

    Reads ``ceil(self.size_mark / 4)`` bytes from the current position of
    ``db_file``, stores them in ``self.pitch_marks`` as a one-dimensional
    ``uint8`` array (one element per byte read), then stores the resulting
    file position in ``self.raw_offset``.

    Args:
        db_file: Binary stream positioned at the start of the pitch-mark
            table.

    Raises:
        EOFError: If the stream ends before the whole table could be read.
    """
    # Round the mark count up to a whole number of bytes.
    # NOTE(review): presumably four pitch marks are packed per byte --
    # confirm against the MBROLA database format.
    round_size = (self.size_mark + 3) // 4
    raw_marks = db_file.read(round_size)
    if len(raw_marks) != round_size:
        raise EOFError(
            f"Expected {round_size} bytes of pitch marks, "
            f"got {len(raw_marks)}"
        )
    # np.array() cannot decode a bytes buffer from a struct-style format
    # string; np.frombuffer() interprets the bytes directly.
    self.pitch_marks = np.frombuffer(raw_marks, dtype=np.uint8)
    self.raw_offset = db_file.tell()
def read_info(self, db_file: BinaryIO):
    """Read the information strings stored after the raw data section.

    Seeks to ``self.raw_offset + self.size_raw`` and then collects every
    string returned by ``read_str`` into ``self.info`` until ``read_str``
    raises ``EOFError``.

    Args:
        db_file: Seekable binary stream of the database file.
    """
    # `whence` is passed positionally: the buffered readers returned by
    # open(..., "rb") reject it as a keyword argument.
    db_file.seek(self.raw_offset + self.size_raw, SEEK_SET)
    self.info = []
    while True:
        try:
            self.info.append(read_str(db_file))
        except EOFError:
            # Relies on read_str signalling end of file with EOFError.
            break
def read_database(self):
with open(self.db_path, "rb") as db_file:

1
requirements.txt

@ -0,0 +1 @@
numpy
Loading…
Cancel
Save