Skip to content

Commit efd7830

Browse files
authored
Merge pull request #4 from KAIST-MACLab/develop
update to 0.2
2 parents 6b7f76a + 3f2366a commit efd7830

25 files changed

+1004
-139
lines changed

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
1-
poetry.lock
1+
poetry.lock
2+
dist
3+
.vscode
4+
.ipynb_checkpoints
5+
__pycache__

.travis.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
language: python
2+
python:
3+
- "3.6"
4+
- "3.7"
5+
- "3.8"
6+
before_install:
7+
- pip install poetry
8+
install:
9+
- poetry install
10+
script:
11+
- pytest

LICENSE

Lines changed: 674 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
PyTSMod
22
===================
3+
[![PyPI](https://img.shields.io/pypi/v/pytsmod.svg)](https://pypi.python.org/pypi/pytsmod)
4+
[![Build Status](https://travis-ci.org/KAIST-MACLab/PyTSMod.svg?branch=master)](https://travis-ci.org/KAIST-MACLab/PyTSMod)
35

46
PyTSMod is an open-source library for Time-Scale Modification algorithms in Python 3. PyTSMod contains basic TSM algorithms such as Overlap-Add (OLA), Waveform-Similarity Overlap-Add (WSOLA), Time-Domain Pitch-Synchronous Overlap-Add (TD-PSOLA), and Phase Vocoder (PV-TSM). We are also planning to add more TSM algorithms and pitch shifting algorithms.
57

@@ -34,11 +36,11 @@ $ poetry build
3436

3537
### Requirements
3638

37-
To use PyTSMod, following packages are required.
38-
- NumPy (>=1.13.0)
39+
To use PyTSMod, Python >= 3.6 and the following packages are required.
40+
- NumPy (>=1.16.0)
3941
- SciPy (>=1.0.0)
40-
- libROSA
41-
- soundfile
42+
- libROSA (>=0.8.0)
43+
- soundfile (>=0.10.0)
4244

4345
## Using PyTSMod
4446

@@ -48,7 +50,7 @@ OLA, WSOLA, and PV-TSM can be imported as module to be used directly in Python.
4850

4951
```python
5052
import numpy as np
51-
import PyTSMod as tsm
53+
import pytsmod as tsm
5254
import soundfile as sf # you can use other audio load packages.
5355

5456
x, sr = sf.read('/FILEPATH/AUDIOFILE.wav')
@@ -73,7 +75,7 @@ When using TD-PSOLA, the estimated pitch information of the source you want to m
7375

7476
```python
7577
import numpy as np
76-
import PyTSMod as tsm
78+
import pytsmod as tsm
7779
import crepe # you can use other pitch tracking algorithms.
7880
import soundfile as sf # you can use other audio load packages.
7981

pyproject.toml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
[tool.poetry]
22
name = "pytsmod"
3-
version = "0.1.1"
3+
version = "0.2.0"
44
description = ""
55
authors = ["Sangeon Yong <koragon2@kaist.ac.kr>"]
66

77
[tool.poetry.dependencies]
88
python = "^3.6"
9-
numpy = "^1.19.2"
10-
nptyping = "^1.3.0"
11-
data-science-types = "^0.2.17"
9+
numpy = "^1.16.0"
10+
scipy = "^1.0.0"
11+
soundfile = "^0.10.0"
12+
librosa = "^0.8"
1213

1314
[tool.poetry.dev-dependencies]
1415
pytest = "^5.2"

pytsmod/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = '0.1.0'
1+
__version__ = '0.2.0'
22

33
from .tdpsolatsm import *
44
from .wsolatsm import *

pytsmod/pvtsm.py

Lines changed: 13 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import numpy as np
2-
from .utils import win as stft, istft
32
from scipy.interpolate import interp1d
3+
from .utils import stft, istft, _validate_audio, _validate_scale_factor
44

55

66
def phase_vocoder(x, s, win_type='sin', win_size=2048, syn_hop_size=512,
@@ -41,25 +41,12 @@ def phase_vocoder(x, s, win_type='sin', win_size=2048, syn_hop_size=512,
4141
y : numpy.ndarray [shape=(channel, num_samples) or (num_samples)]
4242
the modified output audio sequence.
4343
"""
44-
45-
if x.ndim == 1: # make mono source to 2D array with a single row.
46-
x = np.expand_dims(x, 0)
47-
else:
48-
raise Exception("Please use the valid audio source. "
49-
+ "Number of dimension of input should be less than 3.")
50-
51-
if np.isscalar(s):
52-
anc_points = np.array([[0, np.shape(x)[1] - 1],
53-
[0, np.ceil(s * np.shape(x)[1]) - 1]])
54-
elif s.shape[1] == 2:
55-
anc_points = s
56-
else:
57-
raise Exception('Please use the valid anchorPoints. '
58-
+ '(scalar or pair of input/output sample points)')
59-
60-
output_length = int(anc_points[-1, -1]) + 1
44+
# validate the input audio and scale factor.
45+
x = _validate_audio(x)
46+
anc_points = _validate_scale_factor(x, s)
6147

6248
n_chan = x.shape[0]
49+
output_length = int(anc_points[-1, -1]) + 1
6350

6451
sw_pos = np.arange(0, output_length + win_size // 2, syn_hop_size)
6552
ana_interpolated = interp1d(anc_points[1, :], anc_points[0, :],
@@ -153,28 +140,21 @@ def phase_vocoder_int(x, s, win_type='hann', win_size=2048, syn_hop_size=512,
153140
y : numpy.ndarray [shape=(channel, num_samples) or (num_samples)]
154141
the modified output audio sequence.
155142
"""
156-
157-
if zero_pad is None:
158-
zero_pad = s * win_size // 2
159-
160-
if x.ndim == 1: # make mono source to 2D array with a single row.
161-
x = np.expand_dims(x, 0)
162-
elif x.ndim > 2:
163-
raise Exception("Please use the valid audio source. "
164-
+ "Number of dimension of input should be less than 3.")
165-
143+
# validate the input audio and scale factor.
144+
x = _validate_audio(x)
166145
if np.isscalar(s) and isinstance(s, int) and s >= 1:
167146
anchor_points = np.array([[0, np.shape(x)[1] - 1],
168147
[0, np.ceil(s * np.shape(x)[1]) - 1]])
169148
else:
170149
raise Exception("Please use the valid stretching rate. "
171150
+ "(integer stretching factors larger than 0)")
172151

173-
output_length = int(anchor_points[-1, -1]) + 1
152+
if zero_pad is None:
153+
zero_pad = s * win_size // 2
174154

175-
win_size_half = int(np.round(win_size / 2))
155+
output_length = int(anchor_points[-1, -1]) + 1
176156

177-
out_win_pos = np.arange(0, output_length + win_size_half, syn_hop_size)
157+
out_win_pos = np.arange(0, output_length + win_size // 2, syn_hop_size)
178158
in_win_pos = ((out_win_pos - 1) / s + 1).astype(int)
179159

180160
n_channels = x.shape[0]
@@ -227,10 +207,10 @@ def _find_peaks(spec):
227207
return peaks, np.empty(0)
228208

229209
# Find region of influence. Axis 0 represents start and end each.
230-
infl_region = np.zeros(2, peaks.shape)
210+
infl_region = np.zeros((2, peaks.size))
231211
infl_region[0, 0] = 0
232212
infl_region[0, 1:] = np.ceil((peaks[1:] + peaks[: -1]) / 2)
233213
infl_region[1, : -1] = infl_region[0, 1:] - 1
234-
infl_region[1, -1] = infl_region.shape[-1]
214+
infl_region[1, -1] = spec.size - 1
235215

236216
return peaks, infl_region.astype(int)

pytsmod/tdpsolatsm.py

Lines changed: 31 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import numpy as np
22

33
from .utils import win as win_func
4+
from .utils import _validate_audio, _validate_f0
45

56

67
def tdpsola(x, sr, src_f0, tgt_f0=None, alpha=1, beta=None,
@@ -38,85 +39,50 @@ def tdpsola(x, sr, src_f0, tgt_f0=None, alpha=1, beta=None,
3839
y : numpy.ndarray [shape=(channel, num_samples) or (num_samples)]
3940
the modified output audio sequence.
4041
"""
41-
42-
if x.ndim == 1: # make mono source to 2D array with a single row.
43-
x = np.expand_dims(x, 0)
44-
elif x.ndim > 2:
45-
raise Exception("Please use the valid audio source. "
46-
+ "Number of dimension of input should be less than 3.")
47-
48-
if src_f0.ndim == 1:
49-
src_f0 = np.expand_dims(src_f0, 0)
50-
elif src_f0.ndim == 2:
51-
if x.shape[0] != src_f0.shape[0] and src_f0.shape[0] != 1:
52-
raise Exception("The number of channels of source f0 value "
53-
+ "should 1 or same as the source.")
54-
elif src_f0.ndim > 2:
55-
raise Exception("Please use the valid source f0 value. "
56-
+ "Number of dimension of source f0 "
57-
+ "should be less than 3.")
58-
59-
# Check if system uses target_f0 or beta.
60-
if (tgt_f0 is None) and beta is None:
42+
# validate the input audio, input pitch and scale factor.
43+
x = _validate_audio(x)
44+
src_f0 = _validate_f0(x, src_f0)
45+
if tgt_f0 is not None:
46+
if beta is not None:
47+
raise Exception("You cannot use both tgt_f0 and beta as an input.")
48+
tgt_f0 = _validate_f0(x, tgt_f0)
49+
elif beta is None:
6150
beta = 1
62-
elif beta is None: # Uses target_f0
63-
if tgt_f0.ndim == 1:
64-
tgt_f0 = np.expand_dims(tgt_f0, 0)
65-
elif tgt_f0.ndim == 2:
66-
if x.shape[0] != tgt_f0.shape[0] and tgt_f0.shape[0] != 1:
67-
raise Exception("The number of channels of target f0 value "
68-
+ "should 1 or same as the source.")
69-
elif tgt_f0.ndim > 2:
70-
raise Exception("Please use the valid target f0 value. "
71-
+ "Number of dimension of target f0 "
72-
+ "should be less than 3.")
73-
elif (tgt_f0 is not None) and (beta is not None):
74-
raise Exception("You cannot use both target_f0 and beta as an input.")
75-
elif not np.isscalar(beta):
76-
raise Exception("The beta value should be a scalar.")
77-
78-
n_channels = x.shape[0]
51+
52+
n_chan = x.shape[0]
7953
output_length = int(np.ceil(x.shape[1] * alpha))
80-
y = np.zeros((n_channels, output_length))
54+
y = np.zeros((n_chan, output_length))
8155

8256
for c, x_chan in enumerate(x):
83-
if src_f0.ndim == 1:
84-
src_f0_chan = src_f0
85-
else:
86-
src_f0_chan = src_f0[c]
87-
57+
src_f0_chan = src_f0[c]
8858
src_f0_chan[np.isnan(src_f0_chan)] = 0
8959
pm_chan = _find_pitch_marks(x_chan, sr, src_f0_chan, p_hop_size,
9060
p_win_size)
91-
92-
if tgt_f0 is not None:
93-
if tgt_f0.ndim == 1:
94-
tgt_f0_chan = tgt_f0
95-
else:
96-
tgt_f0_chan = tgt_f0[c]
97-
beta = _target_f0_to_beta(x_chan, pm_chan,
98-
src_f0_chan, tgt_f0_chan)
99-
10061
pitch_period = np.diff(pm_chan) # compute pitch periods
10162

102-
# make beta to an array if beta is a fixed value.
103-
if np.isscalar(beta):
104-
beta = np.ones(pitch_period.size) * beta
63+
if tgt_f0 is not None:
64+
tgt_f0_chan = tgt_f0[c]
65+
beta_seq = _target_f0_to_beta(x_chan, pm_chan,
66+
src_f0_chan, tgt_f0_chan)
67+
else:
68+
beta_seq = np.ones(pitch_period.size) * beta
10569

10670
if pm_chan[0] <= pitch_period[0]: # remove first pitch mark
10771
pm_chan = pm_chan[1:]
10872
pitch_period = pitch_period[1:]
109-
beta = beta[1:]
73+
beta_seq = beta_seq[1:]
11074

11175
if pm_chan[-1] + pitch_period[-1] > x_chan.size: # remove last pitch mark
11276
pm_chan = pm_chan[: -1]
11377
else:
11478
pitch_period = np.append(pitch_period, pitch_period[-1])
115-
beta = np.append(beta, beta[-1])
79+
beta_seq = np.append(beta_seq, beta_seq[-1])
80+
81+
output_length = int(np.ceil(x_chan.size * alpha))
11682

117-
output_length = int(np.ceil(x.size * alpha))
118-
x_chan = np.pad(x_chan, (1024, 1024))
119-
y_chan = np.zeros(output_length + 2 * 1024) # output signal
83+
pad = int(np.ceil(sr / 100))
84+
x_chan = np.pad(x_chan, (pad, pad))
85+
y_chan = np.zeros(output_length + 2 * pad) # output signal
12086

12187
tk = pitch_period[0] + 1 # output pitch mark
12288
ow = np.zeros(y_chan.shape)
@@ -131,19 +97,19 @@ def tdpsola(x, sr, src_f0, tgt_f0=None, alpha=1, beta=None,
13197
st = pm_chan[i] - pit
13298
en = pm_chan[i] + pit
13399

134-
gr = x_chan[st + 1024: en + 1024 + 1] * win
100+
gr = x_chan[st + pad: en + pad + 1] * win
135101

136-
ini_gr = round(tk) - pit + 1024
137-
end_gr = round(tk) + pit + 1024
102+
ini_gr = round(tk) - pit + pad
103+
end_gr = round(tk) + pit + pad
138104

139105
y_chan[ini_gr: end_gr + 1] = y_chan[ini_gr: end_gr + 1] + gr
140106
ow[ini_gr: end_gr + 1] = ow[ini_gr: end_gr + 1] + win
141-
tk = tk + pit / beta[i]
107+
tk = tk + pit / beta_seq[i]
142108

143109
ow[ow < 1e-3] = 1
144110

145111
y_chan = y_chan / ow
146-
y_chan = y_chan[1024:]
112+
y_chan = y_chan[pad:]
147113
y_chan = y_chan[: output_length]
148114
y[c, :] = y_chan
149115

pytsmod/utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from .stft import *
22
from .win import *
3+
from .validate import _validate_audio, _validate_scale_factor, _validate_f0

pytsmod/utils/stft.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -161,17 +161,16 @@ def lsee_mstft(X, syn_hop, win_type, win_size, zero_pad, fft_shift,
161161
w = win_func(win_type, win_size, zero_pad)
162162

163163
win_len = len(w)
164-
win_len_half = round(win_len / 2)
165-
num_of_frames = X.shape[1]
166-
win_pos = np.arange(num_of_frames) * syn_hop
164+
n_frames = X.shape[1]
165+
win_pos = np.arange(n_frames) * syn_hop
167166
signal_length = win_pos[-1] + win_len
168167

169168
x = np.zeros(signal_length)
170169
ow = np.zeros(signal_length)
171-
for i in range(num_of_frames):
170+
for i in range(n_frames):
172171
curr_spec = X[:, i]
173172

174-
Xi = np.concatenate((curr_spec, np.flip(np.conj(curr_spec[1:-1]))))
173+
Xi = np.append(curr_spec, np.flip(np.conj(curr_spec[1:-1])))
175174
xi = np.real(np.fft.ifft(Xi))
176175
if fft_shift:
177176
xi = np.fft.fftshift(xi)
@@ -183,13 +182,13 @@ def lsee_mstft(X, syn_hop, win_type, win_size, zero_pad, fft_shift,
183182
xiw_energy = np.sum(abs(xiw))
184183
xiw = xiw * (xi_energy / (xiw_energy + np.finfo(np.float).eps))
185184

186-
x[win_pos[i]: win_pos[i] + win_len] = x[win_pos[i]: win_pos[i] + win_len] + xiw
185+
x[win_pos[i]: win_pos[i] + win_len] += xiw
187186

188-
ow[win_pos[i]: win_pos[i] + win_len] = ow[win_pos[i]: win_pos[i] + win_len] + np.power(w, 2)
187+
ow[win_pos[i]: win_pos[i] + win_len] += np.power(w, 2)
189188

190-
ow[ow < pow(10, -3)] = 1 # avoid potential division by zero
189+
ow[ow < 1e-3] = 1
191190
x = x / ow
192191

193-
x = x[win_len_half: - win_len_half + 1]
192+
x = x[win_len // 2: - win_len // 2]
194193

195194
return x

0 commit comments

Comments
 (0)