1
+ import numpy as np
2
+ import librosa
3
+ import librosa .display
4
+ import noisereduce as nr
5
+
6
class AudioPreprocessor:
    """
    Stateless helpers that turn a raw 1-D audio signal into windowed frames.

    All members are static methods; the class only serves as a namespace.
    """

    @staticmethod
    def int_to_float(array, type=np.float32):
        """
        Convert an integer sample array into a peak-normalised float array.

        The samples are divided by the largest absolute sample value, so the
        result lies in [-1, 1]. An all-zero input yields an all-zero output.

        Parameters
        ----------
        array : np.ndarray
            Input samples. Arrays whose dtype already equals ``type``, or is
            one of float16 / float32 / float64, are returned unchanged.
        type : floating dtype, default np.float32
            dtype of the converted result.

        Returns
        -------
        result : np.ndarray
        """
        if array.dtype == type:
            return array

        # Floating-point input of another width is passed through untouched.
        if array.dtype in (np.float16, np.float32, np.float64):
            return array

        # Convert BEFORE taking the absolute value: np.abs on a signed integer
        # array overflows for the most negative value (e.g. int16 -32768 stays
        # -32768), which would corrupt the peak.
        converted = array.astype(type)
        if converted.size == 0:
            return converted

        peak = np.max(np.abs(converted))
        if peak == 0:
            # Pure silence: nothing to normalise, avoid dividing by zero.
            return converted

        return converted / peak

    @staticmethod
    def float_to_int(array, type=np.int16, divide_max_abs=True):
        """
        Convert a float sample array into an integer array.

        Samples are scaled by the largest positive value of ``type``. The
        input array is never modified.

        Parameters
        ----------
        array : np.ndarray
            Input samples. Arrays whose dtype already equals ``type``, or is
            one of int16 / int32 / int64, are returned unchanged.
        type : integer dtype, default np.int16
            dtype of the converted result.
        divide_max_abs : bool, default True
            If True, normalise by the largest absolute sample first so the
            peak maps to the full integer range. If False, samples are scaled
            as they are and values outside the target range are clipped.

        Returns
        -------
        result : np.ndarray
        """
        if array.dtype == type:
            return array

        # Integer input of another width is passed through untouched.
        if array.dtype in (np.int16, np.int32, np.int64):
            return array

        if array.size == 0:
            return array.astype(type)

        info = np.iinfo(type)
        peak = np.max(np.abs(array))

        if divide_max_abs and peak != 0:
            scaled = array / peak * info.max
        else:
            # Without normalisation (or for pure silence) scale directly and
            # clip, so out-of-range samples saturate instead of wrapping
            # around in the integer cast.
            scaled = np.clip(array * info.max, info.min, info.max)

        return scaled.astype(type)

    @staticmethod
    def remove_silence(y, threshold=0.005, min_pause_samples=200, keep_samples=50):
        """
        Cut long quiet stretches out of a 1-D signal.

        A sample is quiet when its absolute value is below ``threshold``.
        Every run of more than ``min_pause_samples`` consecutive quiet samples
        is removed, except for ``keep_samples`` samples at each end of the run.

        Parameters
        ----------
        y : np.ndarray
            1-D signal.
        threshold : float, default 0.005
            Amplitude below which a sample counts as quiet.
        min_pause_samples : int, default 200
            A quiet run must be longer than this (in samples) to be shortened.
        keep_samples : int, default 50
            Number of quiet samples kept at the start and end of each run.

        Returns
        -------
        result : np.ndarray
            Copy of ``y`` without the removed samples.

        Raises
        ------
        ValueError
            If ``y`` is not 1-D or a length argument is negative.
        """
        y = np.asarray(y)
        if y.ndim != 1:
            raise ValueError("y must be a 1-D array.")
        if min_pause_samples < 0 or keep_samples < 0:
            raise ValueError("min_pause_samples and keep_samples must be non-negative.")

        quiet = np.abs(y) < threshold

        # Pad with False on both sides so every quiet run has a rising and a
        # falling edge, including runs touching the start or end of the signal.
        padded = np.concatenate(([False], quiet, [False]))
        edges = np.flatnonzero(padded[1:] != padded[:-1])
        run_starts = edges[0::2]  # index of the first quiet sample of a run
        run_ends = edges[1::2]    # index one past the last quiet sample

        is_long = (run_ends - run_starts) > min_pause_samples

        keep = np.ones(y.shape[0], dtype=bool)
        for start, end in zip(run_starts[is_long], run_ends[is_long]):
            cut_from = start + keep_samples
            cut_to = end - keep_samples
            if cut_to > cut_from:
                keep[cut_from:cut_to] = False

        return y[keep]

    @staticmethod
    def remove_noise(y, sr, prop_decrease=0.8):
        """
        Reduce noise in a signal by forwarding it to ``nr.reduce_noise``.

        Parameters
        ----------
        y : np.ndarray
            Signal.
        sr : int
            Sampling rate of ``y``.
        prop_decrease : float, default 0.8
            Passed through to ``nr.reduce_noise``. 0.8 only reduces the noise
            by 0.8, which gives better sound quality than 1.0.

        Returns
        -------
        result : np.ndarray
        """
        return nr.reduce_noise(y=y, sr=sr, prop_decrease=prop_decrease)

    @staticmethod
    def create_frames(y, frame_size, overlap):
        """
        Split a signal into equally sized, overlapping frames.

        Consecutive frames start ``frame_size - overlap`` samples apart. Only
        complete frames are produced; a trailing remainder shorter than
        ``frame_size`` is dropped.

        Parameters
        ----------
        y : np.ndarray
            Signal.
        frame_size : int
            Number of samples per frame.
        overlap : int
            Number of samples shared by two consecutive frames.

        Returns
        -------
        frames : list of np.ndarray
            Empty when ``frame_size <= 0``, ``overlap < 0`` or
            ``overlap >= frame_size``.
        """
        if overlap >= frame_size or frame_size <= 0 or overlap < 0:
            return []

        hop = frame_size - overlap
        # "+ 1" so that a frame ending exactly on the last sample is included.
        last_start_exclusive = len(y) - frame_size + 1

        return [
            y[start:start + frame_size]
            for start in range(0, last_start_exclusive, hop)
        ]

    @staticmethod
    def window_frames(frames, window_function=np.hanning):
        """
        Multiply every frame with a window of matching length.

        Parameters
        ----------
        frames : iterable of np.ndarray
            Frames, e.g. as produced by ``create_frames``.
        window_function : callable, default np.hanning
            Called with the frame length; must return the window to apply.

        Returns
        -------
        windowed_frames : list of np.ndarray
        """
        return [frame * window_function(frame.shape[0]) for frame in frames]

    @staticmethod
    def load_preprocessed_frames(filepath=None, y=None, sr=None, frame_size=1000, overlap=100):
        """
        Run the whole pipeline: noise reduction, silence removal, framing
        and windowing.

        Parameters
        ----------
        filepath : str, optional
            Audio file loaded with ``librosa.load``. Used whenever ``y`` or
            ``sr`` is missing.
        y : np.ndarray, optional
            Signal; only used when ``sr`` is given as well.
        sr : int, optional
            Sampling rate of ``y``.
        frame_size : int, default 1000
            Number of samples per frame.
        overlap : int, default 100
            Number of samples shared by two consecutive frames.

        Returns
        -------
        windowed_frames : list of np.ndarray

        Raises
        ------
        ValueError
            If neither ``filepath`` nor both ``y`` and ``sr`` are given.
        """
        if filepath is None and (y is None or sr is None):
            raise ValueError("Either filepath or y and sr must be given.")

        if y is None or sr is None:
            y, sr = librosa.load(filepath)

        y = AudioPreprocessor.remove_noise(y=y, sr=sr)
        y = AudioPreprocessor.remove_silence(y=y)

        frames = AudioPreprocessor.create_frames(y=y, frame_size=frame_size, overlap=overlap)
        return AudioPreprocessor.window_frames(frames=frames)
131
+
132
def main():
    """Preprocess ``./audio.wav`` and print the resulting windowed frames."""
    windowed = AudioPreprocessor.load_preprocessed_frames("./audio.wav")
    print(windowed)


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments