megaaziib committed
Commit 9ac3941
Parent(s): 8b63acc

fixed rmvpe infer pipelines

Files changed (1): vc_infer_pipeline.py (+20 -223)
vc_infer_pipeline.py CHANGED
@@ -1,8 +1,6 @@
 import numpy as np, parselmouth, torch, pdb, sys, os
 from time import time as ttime
 import torch.nn.functional as F
-import torchcrepe  # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
-from torch import Tensor
 import scipy.signal as signal
 import pyworld, os, traceback, faiss, librosa, torchcrepe
 from scipy import signal
@@ -71,186 +69,6 @@ class VC(object):
         self.t_max = self.sr * self.x_max  # query-free duration threshold
         self.device = config.device
 
-    # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device)
-    def get_optimal_torch_device(self, index: int = 0) -> torch.device:
-        # Get cuda device
-        if torch.cuda.is_available():
-            return torch.device(
-                f"cuda:{index % torch.cuda.device_count()}"
-            )  # Very fast
-        elif torch.backends.mps.is_available():
-            return torch.device("mps")
-        # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
-        # Else wise return the "cpu" as a torch device,
-        return torch.device("cpu")
-
-    # Fork Feature: Compute f0 with the crepe method
-    def get_f0_crepe_computation(
-        self,
-        x,
-        f0_min,
-        f0_max,
-        p_len,
-        hop_length=160,  # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
-        model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
-    ):
-        x = x.astype(
-            np.float32
-        )  # fixes the F.conv2D exception. We needed to convert double to float.
-        x /= np.quantile(np.abs(x), 0.999)
-        torch_device = self.get_optimal_torch_device()
-        audio = torch.from_numpy(x).to(torch_device, copy=True)
-        audio = torch.unsqueeze(audio, dim=0)
-        if audio.ndim == 2 and audio.shape[0] > 1:
-            audio = torch.mean(audio, dim=0, keepdim=True).detach()
-        audio = audio.detach()
-        print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
-        pitch: Tensor = torchcrepe.predict(
-            audio,
-            self.sr,
-            hop_length,
-            f0_min,
-            f0_max,
-            model,
-            batch_size=hop_length * 2,
-            device=torch_device,
-            pad=True,
-        )
-        p_len = p_len or x.shape[0] // hop_length
-        # Resize the pitch for final f0
-        source = np.array(pitch.squeeze(0).cpu().float().numpy())
-        source[source < 0.001] = np.nan
-        target = np.interp(
-            np.arange(0, len(source) * p_len, len(source)) / p_len,
-            np.arange(0, len(source)),
-            source,
-        )
-        f0 = np.nan_to_num(target)
-        return f0  # Resized f0
-
-    def get_f0_official_crepe_computation(
-        self,
-        x,
-        f0_min,
-        f0_max,
-        model="full",
-    ):
-        # Pick a batch size that doesn't cause memory errors on your gpu
-        batch_size = 512
-        # Compute pitch using first gpu
-        audio = torch.tensor(np.copy(x))[None].float()
-        f0, pd = torchcrepe.predict(
-            audio,
-            self.sr,
-            self.window,
-            f0_min,
-            f0_max,
-            model,
-            batch_size=batch_size,
-            device=self.device,
-            return_periodicity=True,
-        )
-        pd = torchcrepe.filter.median(pd, 3)
-        f0 = torchcrepe.filter.mean(f0, 3)
-        f0[pd < 0.1] = 0
-        f0 = f0[0].cpu().numpy()
-        return f0
-
-    # Fork Feature: Compute pYIN f0 method
-    def get_f0_pyin_computation(self, x, f0_min, f0_max):
-        y, sr = librosa.load("saudio/Sidney.wav", self.sr, mono=True)
-        f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)
-        f0 = f0[1:]  # Get rid of extra first frame
-        return f0
-
-    # Fork Feature: Acquire median hybrid f0 estimation calculation
-    def get_f0_hybrid_computation(
-        self,
-        methods_str,
-        input_audio_path,
-        x,
-        f0_min,
-        f0_max,
-        p_len,
-        filter_radius,
-        crepe_hop_length,
-        time_step,
-    ):
-        # Get various f0 methods from input to use in the computation stack
-        s = methods_str
-        s = s.split("hybrid")[1]
-        s = s.replace("[", "").replace("]", "")
-        methods = s.split("+")
-        f0_computation_stack = []
-
-        print("Calculating f0 pitch estimations for methods: %s" % str(methods))
-        x = x.astype(np.float32)
-        x /= np.quantile(np.abs(x), 0.999)
-        # Get f0 calculations for all methods specified
-        for method in methods:
-            f0 = None
-            if method == "pm":
-                f0 = (
-                    parselmouth.Sound(x, self.sr)
-                    .to_pitch_ac(
-                        time_step=time_step / 1000,
-                        voicing_threshold=0.6,
-                        pitch_floor=f0_min,
-                        pitch_ceiling=f0_max,
-                    )
-                    .selected_array["frequency"]
-                )
-                pad_size = (p_len - len(f0) + 1) // 2
-                if pad_size > 0 or p_len - len(f0) - pad_size > 0:
-                    f0 = np.pad(
-                        f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
-                    )
-            elif method == "crepe":
-                f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
-                f0 = f0[1:]  # Get rid of extra first frame
-            elif method == "crepe-tiny":
-                f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
-                f0 = f0[1:]  # Get rid of extra first frame
-            elif method == "mangio-crepe":
-                f0 = self.get_f0_crepe_computation(
-                    x, f0_min, f0_max, p_len, crepe_hop_length
-                )
-            elif method == "mangio-crepe-tiny":
-                f0 = self.get_f0_crepe_computation(
-                    x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
-                )
-            elif method == "harvest":
-                f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
-                if filter_radius > 2:
-                    f0 = signal.medfilt(f0, 3)
-                f0 = f0[1:]  # Get rid of first frame.
-            elif method == "dio":  # Potentially buggy?
-                f0, t = pyworld.dio(
-                    x.astype(np.double),
-                    fs=self.sr,
-                    f0_ceil=f0_max,
-                    f0_floor=f0_min,
-                    frame_period=10,
-                )
-                f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
-                f0 = signal.medfilt(f0, 3)
-                f0 = f0[1:]
-            # elif method == "pyin": Not Working just yet
-            #     f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)
-            # Push method to the stack
-            f0_computation_stack.append(f0)
-
-        for fc in f0_computation_stack:
-            print(len(fc))
-
-        print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
-        f0_median_hybrid = None
-        if len(f0_computation_stack) == 1:
-            f0_median_hybrid = f0_computation_stack[0]
-        else:
-            f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
-        return f0_median_hybrid
-
     def get_f0(
         self,
         input_audio_path,
@@ -259,7 +77,6 @@ class VC(object):
         f0_up_key,
         f0_method,
         filter_radius,
-        crepe_hop_length,
         inp_f0=None,
     ):
         global input_audio_path2wav
@@ -289,28 +106,27 @@
             f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
             if filter_radius > 2:
                 f0 = signal.medfilt(f0, 3)
-        elif f0_method == "dio":  # Potentially Buggy?
-            f0, t = pyworld.dio(
-                x.astype(np.double),
-                fs=self.sr,
-                f0_ceil=f0_max,
-                f0_floor=f0_min,
-                frame_period=10,
-            )
-            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
-            f0 = signal.medfilt(f0, 3)
         elif f0_method == "crepe":
-            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
-        elif f0_method == "crepe-tiny":
-            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
-        elif f0_method == "mangio-crepe":
-            f0 = self.get_f0_crepe_computation(
-                x, f0_min, f0_max, p_len, crepe_hop_length
-            )
-        elif f0_method == "mangio-crepe-tiny":
-            f0 = self.get_f0_crepe_computation(
-                x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
+            model = "full"
+            # Pick a batch size that doesn't cause memory errors on your gpu
+            batch_size = 512
+            # Compute pitch using first gpu
+            audio = torch.tensor(np.copy(x))[None].float()
+            f0, pd = torchcrepe.predict(
+                audio,
+                self.sr,
+                self.window,
+                f0_min,
+                f0_max,
+                model,
+                batch_size=batch_size,
+                device=self.device,
+                return_periodicity=True,
             )
+            pd = torchcrepe.filter.median(pd, 3)
+            f0 = torchcrepe.filter.mean(f0, 3)
+            f0[pd < 0.1] = 0
+            f0 = f0[0].cpu().numpy()
         elif f0_method == "rmvpe":
             if hasattr(self, "model_rmvpe") == False:
                 from rmvpe import RMVPE
@@ -320,22 +136,6 @@
                     "rmvpe.pt", is_half=self.is_half, device=self.device
                 )
             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
-
-        elif "hybrid" in f0_method:
-            # Perform hybrid median pitch estimation
-            input_audio_path2wav[input_audio_path] = x.astype(np.double)
-            f0 = self.get_f0_hybrid_computation(
-                f0_method,
-                input_audio_path,
-                x,
-                f0_min,
-                f0_max,
-                p_len,
-                filter_radius,
-                crepe_hop_length,
-                time_step,
-            )
-
         f0 *= pow(2, f0_up_key / 12)
         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
         tf0 = self.sr // self.window  # f0 points per second
@@ -359,7 +159,6 @@
         f0_mel[f0_mel <= 1] = 1
         f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(np.int)
-
         return f0_coarse, f0bak  # 1-0
 
     def vc(
@@ -484,7 +283,6 @@
         rms_mix_rate,
         version,
         protect,
-        crepe_hop_length,
         f0_file=None,
     ):
         if (
@@ -546,7 +344,6 @@
                 f0_up_key,
                 f0_method,
                 filter_radius,
-                crepe_hop_length,
                 inp_f0,
             )
             pitch = pitch[:p_len]
@@ -643,4 +440,4 @@
         del pitch, pitchf, sid
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-        return audio_opt
+        return audio_opt
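Note: the change above collapses the fork's four crepe variants ("crepe", "crepe-tiny", "mangio-crepe", "mangio-crepe-tiny") into a single inline "crepe" branch inside get_f0. Below is a minimal self-contained sketch of that branch; sr=16000 and hop=160 are assumed stand-ins for the values the VC instance carries (self.sr, self.window), and crepe_f0, f0_min=50 and f0_max=1100 are illustrative, not part of the commit.

```python
import numpy as np
import torch
import torchcrepe

def crepe_f0(x, sr=16000, hop=160, f0_min=50.0, f0_max=1100.0, device="cpu"):
    # Shape the signal as (1, n_samples), exactly as the inlined branch does.
    audio = torch.tensor(np.copy(x))[None].float()
    f0, pd = torchcrepe.predict(
        audio, sr, hop, f0_min, f0_max, "full",
        batch_size=512, device=device, return_periodicity=True,
    )
    pd = torchcrepe.filter.median(pd, 3)  # smooth the periodicity track
    f0 = torchcrepe.filter.mean(f0, 3)    # smooth the pitch track
    f0[pd < 0.1] = 0                      # zero out low-confidence (unvoiced) frames
    return f0[0].cpu().numpy()
```

One consequence of the inlining is that the crepe path no longer honors a user-supplied hop length; it always uses self.window, which is why crepe_hop_length disappears from every signature in this diff.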
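The commit title refers to the "rmvpe" branch, which survives as a lazily-initialized path. A minimal sketch of that pattern follows, assuming the rmvpe module from the RVC repo is importable and the rmvpe.pt checkpoint sits in the working directory; rmvpe_f0 and the module-level cache are illustrative stand-ins for self.model_rmvpe.

```python
import numpy as np

_model_rmvpe = None  # plays the role of self.model_rmvpe in the class

def rmvpe_f0(x: np.ndarray, is_half: bool, device: str) -> np.ndarray:
    global _model_rmvpe
    if _model_rmvpe is None:  # construct the model only on first use
        from rmvpe import RMVPE  # local module shipped with the RVC repo
        _model_rmvpe = RMVPE("rmvpe.pt", is_half=is_half, device=device)
    # thred=0.03 is the voicing threshold the pipeline passes verbatim
    return _model_rmvpe.infer_from_audio(x, thred=0.03)
```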
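Downstream of every branch, get_f0 still transposes the curve with f0 *= pow(2, f0_up_key / 12), the equal-temperament rule that one semitone scales frequency by 2**(1/12). A quick numeric check:

```python
import numpy as np

f0 = np.array([110.0, 220.0, 440.0])  # Hz
f0_up_key = 12                        # +12 semitones, i.e. one octave up
print(f0 * 2 ** (f0_up_key / 12))     # [220. 440. 880.]: every value doubles
```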