Yurii Paniv committed on
Commit
b1e6f9e
1 Parent(s): 0605b3e

Handle numbers properly

Browse files
app.py CHANGED
@@ -18,13 +18,15 @@ from torch.cuda import is_available
18
 
19
  class VoiceOption(Enum):
20
  Sevil = "Севіль (жіночий) 👩"
21
- #Arslan = "Арслан (чоловічий) 👨"
22
  Eskander = "Ескандер (чоловічий) 👨"
23
  # Abibulla = "Абібулла (чоловічий) 👨"
24
 
25
 
26
  def check_thread(logging_queue: Queue):
27
- logging_callback = log_data(hf_token=getenv("HF_API_TOKEN"), dataset_name="crh-tts-output", private=False)
 
 
28
  while True:
29
  sleep(60)
30
  batch = []
@@ -35,10 +37,13 @@ def check_thread(logging_queue: Queue):
35
  try:
36
  logging_callback(batch)
37
  except:
38
- print("Error happened while pushing data to HF. Puttting items back in queue...")
 
 
39
  for item in batch:
40
  logging_queue.put(item)
41
 
 
42
  if getenv("HF_API_TOKEN") is not None:
43
  log_queue = Queue()
44
  t = Thread(target=check_thread, args=(log_queue,))
@@ -62,9 +67,9 @@ def tts(text: str, voice: str):
62
 
63
  voice_mapping = {
64
  VoiceOption.Sevil.value: Voices.Sevil.value,
65
- #VoiceOption.Arslan.value: Voices.Arslan.value,
66
  VoiceOption.Eskander.value: Voices.Eskander.value,
67
- #VoiceOption.Abibulla.value: Voices.Abibulla.value,
68
  }
69
 
70
  speaker_name = voice_mapping[voice]
@@ -114,6 +119,7 @@ iface = gr.Interface(
114
  ],
115
  ["Селям! Ишлер насыл?", VoiceOption.Sevil.value],
116
  ["Selâm! 123456789", VoiceOption.Eskander.value],
 
117
  ],
118
  )
119
  iface.launch()
 
18
 
19
  class VoiceOption(Enum):
20
  Sevil = "Севіль (жіночий) 👩"
21
+ # Arslan = "Арслан (чоловічий) 👨"
22
  Eskander = "Ескандер (чоловічий) 👨"
23
  # Abibulla = "Абібулла (чоловічий) 👨"
24
 
25
 
26
  def check_thread(logging_queue: Queue):
27
+ logging_callback = log_data(
28
+ hf_token=getenv("HF_API_TOKEN"), dataset_name="crh-tts-output", private=False
29
+ )
30
  while True:
31
  sleep(60)
32
  batch = []
 
37
  try:
38
  logging_callback(batch)
39
  except:
40
+ print(
41
+ "Error happened while pushing data to HF. Puttting items back in queue..."
42
+ )
43
  for item in batch:
44
  logging_queue.put(item)
45
 
46
+
47
  if getenv("HF_API_TOKEN") is not None:
48
  log_queue = Queue()
49
  t = Thread(target=check_thread, args=(log_queue,))
 
67
 
68
  voice_mapping = {
69
  VoiceOption.Sevil.value: Voices.Sevil.value,
70
+ # VoiceOption.Arslan.value: Voices.Arslan.value,
71
  VoiceOption.Eskander.value: Voices.Eskander.value,
72
+ # VoiceOption.Abibulla.value: Voices.Abibulla.value,
73
  }
74
 
75
  speaker_name = voice_mapping[voice]
 
119
  ],
120
  ["Селям! Ишлер насыл?", VoiceOption.Sevil.value],
121
  ["Selâm! 123456789", VoiceOption.Eskander.value],
122
+ ["Selâm! 1,2,3,4,5,6,789", VoiceOption.Eskander.value],
123
  ],
124
  )
125
  iface.launch()
crh_preprocessor/preprocessor.py CHANGED
@@ -11,44 +11,44 @@ mapping = {
11
  "s\u0327": "\u015f",
12
  "a\u0302": "\xe2",
13
  "w": "v",
14
- "x": "ks"
15
  }
16
 
17
  zero = {
18
- 0: 'sıfır',
19
  }
20
 
21
  numbers_map = {
22
- 1: 'bir',
23
- 2: 'eki',
24
- 3: 'üç',
25
- 4: 'dört',
26
- 5: 'beş',
27
- 6: 'altı',
28
- 7: 'yedi',
29
- 8: 'sekiz',
30
- 9: 'doquz',
31
- 10: 'on',
32
- 20: 'yigirmi',
33
- 30: 'otuz',
34
- 40: 'qırq',
35
- 50: 'elli',
36
- 60: 'altmış',
37
- 70: 'yetmiş',
38
- 80: 'seksen',
39
- 90: 'doqsan',
40
- 100: 'yüz',
41
- 1000: 'biñ',
42
- 1_000_000: 'million',
43
- 1_000_000_000: 'milliard'
44
  }
45
 
46
 
47
  def spell_numbers(numbers: str) -> str:
48
- numbers_map_with_zero = {**numbers_map,**zero}
49
  for i in range(0, 10):
50
- numbers = numbers.replace(str(i), numbers_map_with_zero[i] + ' ')
51
- return numbers.strip()
52
 
53
 
54
  def num2word(n):
@@ -58,24 +58,32 @@ def num2word(n):
58
  tens = (n // 10) * 10
59
  units = n % 10
60
  if units == 0:
61
- return ''
62
- return (numbers_map[tens] + ' ' + numbers_map[units]).strip()
63
  elif n < 1000:
64
  hundreds = n // 100
65
  rest = n % 100
66
- return (num2word(hundreds) + ' ' + numbers_map[100] + ' ' + num2word(rest)).strip()
 
 
67
  elif n < 1_000_000:
68
  thousands = n // 1_000
69
  rest = n % 1_000
70
- return (num2word(thousands) + ' ' + numbers_map[1_000] + ' ' + num2word(rest)).strip()
 
 
71
  elif n < 1_000_000_000:
72
  millions = n // 1_000_000
73
  rest = n % 1_000_000
74
- return (num2word(millions) + ' ' + numbers_map[1_000_000] + ' ' + num2word(rest)).strip()
 
 
75
  elif n < 1_000_000_000_000:
76
  billions = n // 1_000_000_000
77
  rest = n % 1_000_000_000
78
- return (num2word(billions) + ' ' + numbers_map[1_000_000_000] + ' ' + num2word(rest)).strip()
 
 
79
  else:
80
  return spell_numbers(str(n))
81
 
@@ -87,20 +95,29 @@ def preprocess(text):
87
  for symbol in mapping.keys():
88
  text = re.sub(symbol, mapping[symbol], text)
89
 
90
- separators = "?!" # TODO: add proper symbols to tts
91
  for symbol in separators:
92
  text = text.replace(symbol, ".")
93
 
94
  while True:
95
- number_match = re.search("-?\d+(\.|,)?(\d+)?", text)
96
-
 
 
 
 
 
 
 
 
 
 
 
97
  if number_match is None:
98
  break
99
-
100
- print(number_match.string, number_match.start(), number_match.end())
101
-
102
- number = number_match.string.strip()
103
 
 
 
104
  prefix = ""
105
 
106
  if number.startswith("-"):
@@ -112,21 +129,33 @@ def preprocess(text):
112
 
113
  if "." in number:
114
  number = number.split(".")
115
- number = prefix + " noqta ".join((num2word(int(number[0])) if int(number[0]) != 0 else spell_numbers(number[0]), spell_numbers(number[1])))
116
- text = text.replace(number_match.string.strip(), number, 1)
 
 
 
 
 
 
 
117
  continue
118
  elif "," in number:
119
  number = number.split(",")
120
- number = prefix + " virgül ".join((num2word(int(number[0])) if int(number[0]) != 0 else spell_numbers(number[0]), spell_numbers(number[1])))
121
- text = text.replace(number_match.string.strip(), number, 1)
 
 
 
 
 
 
 
122
  continue
123
 
124
  if number.startswith("0"):
125
- text = text.replace(number_match.string.strip(), prefix + spell_numbers(number), 1)
126
  continue
127
-
128
- text = text.replace(number_match.string.strip(), prefix + num2word(int(number)), 1)
129
-
130
 
131
  return text.strip()
132
-
 
11
  "s\u0327": "\u015f",
12
  "a\u0302": "\xe2",
13
  "w": "v",
14
+ "x": "ks",
15
  }
16
 
17
  zero = {
18
+ 0: "sıfır",
19
  }
20
 
21
  numbers_map = {
22
+ 1: "bir",
23
+ 2: "eki",
24
+ 3: "üç",
25
+ 4: "dört",
26
+ 5: "beş",
27
+ 6: "altı",
28
+ 7: "yedi",
29
+ 8: "sekiz",
30
+ 9: "doquz",
31
+ 10: "on",
32
+ 20: "yigirmi",
33
+ 30: "otuz",
34
+ 40: "qırq",
35
+ 50: "elli",
36
+ 60: "altmış",
37
+ 70: "yetmiş",
38
+ 80: "seksen",
39
+ 90: "doqsan",
40
+ 100: "yüz",
41
+ 1000: "biñ",
42
+ 1_000_000: "million",
43
+ 1_000_000_000: "milliard",
44
  }
45
 
46
 
47
  def spell_numbers(numbers: str) -> str:
48
+ numbers_map_with_zero = {**numbers_map, **zero}
49
  for i in range(0, 10):
50
+ numbers = numbers.replace(str(i), numbers_map_with_zero[i] + " ")
51
+ return numbers.strip()
52
 
53
 
54
  def num2word(n):
 
58
  tens = (n // 10) * 10
59
  units = n % 10
60
  if units == 0:
61
+ return ""
62
+ return (numbers_map[tens] + " " + numbers_map[units]).strip()
63
  elif n < 1000:
64
  hundreds = n // 100
65
  rest = n % 100
66
+ return (
67
+ num2word(hundreds) + " " + numbers_map[100] + " " + num2word(rest)
68
+ ).strip()
69
  elif n < 1_000_000:
70
  thousands = n // 1_000
71
  rest = n % 1_000
72
+ return (
73
+ num2word(thousands) + " " + numbers_map[1_000] + " " + num2word(rest)
74
+ ).strip()
75
  elif n < 1_000_000_000:
76
  millions = n // 1_000_000
77
  rest = n % 1_000_000
78
+ return (
79
+ num2word(millions) + " " + numbers_map[1_000_000] + " " + num2word(rest)
80
+ ).strip()
81
  elif n < 1_000_000_000_000:
82
  billions = n // 1_000_000_000
83
  rest = n % 1_000_000_000
84
+ return (
85
+ num2word(billions) + " " + numbers_map[1_000_000_000] + " " + num2word(rest)
86
+ ).strip()
87
  else:
88
  return spell_numbers(str(n))
89
 
 
95
  for symbol in mapping.keys():
96
  text = re.sub(symbol, mapping[symbol], text)
97
 
98
+ separators = "?!" # TODO: add proper symbols to tts
99
  for symbol in separators:
100
  text = text.replace(symbol, ".")
101
 
102
  while True:
103
+ groups_match = re.search("((\d,)+){2,}", text)
104
+ if groups_match is not None:
105
+ text = text.replace(
106
+ groups_match.string[groups_match.start() : groups_match.end()],
107
+ " ".join(
108
+ groups_match.string[
109
+ groups_match.start() : groups_match.end()
110
+ ].split(",")
111
+ ),
112
+ )
113
+ continue
114
+
115
+ number_match = re.search("(\-|\+)?(\d)+((\.|,)?\d+)?", text)
116
  if number_match is None:
117
  break
 
 
 
 
118
 
119
+ number = number_match.string[number_match.start() : number_match.end()]
120
+ number_to_replace = number
121
  prefix = ""
122
 
123
  if number.startswith("-"):
 
129
 
130
  if "." in number:
131
  number = number.split(".")
132
+ number = prefix + " noqta ".join(
133
+ (
134
+ num2word(int(number[0]))
135
+ if int(number[0]) != 0
136
+ else spell_numbers(number[0]),
137
+ spell_numbers(number[1]),
138
+ )
139
+ )
140
+ text = text.replace(number_to_replace, number, 1)
141
  continue
142
  elif "," in number:
143
  number = number.split(",")
144
+ number = prefix + " virgül ".join(
145
+ (
146
+ num2word(int(number[0]))
147
+ if int(number[0]) != 0
148
+ else spell_numbers(number[0]),
149
+ spell_numbers(number[1]),
150
+ )
151
+ )
152
+ text = text.replace(number_to_replace, number, 1)
153
  continue
154
 
155
  if number.startswith("0"):
156
+ text = text.replace(number_to_replace, prefix + spell_numbers(number), 1)
157
  continue
158
+
159
+ text = text.replace(number_to_replace, prefix + num2word(int(number)), 1)
 
160
 
161
  return text.strip()
 
crh_tts/tts.py CHANGED
@@ -10,10 +10,10 @@ from torch import no_grad
10
  class Voices(Enum):
11
  """List of available voices for the model."""
12
 
13
- #Arslan = "arslan"
14
  Sevil = "sevil"
15
  Eskander = "eskander"
16
- #Abibulla = "abibulla"
17
 
18
 
19
  class TTS:
 
10
  class Voices(Enum):
11
  """List of available voices for the model."""
12
 
13
+ # Arslan = "arslan"
14
  Sevil = "sevil"
15
  Eskander = "eskander"
16
+ # Abibulla = "abibulla"
17
 
18
 
19
  class TTS:
data_logger.py CHANGED
@@ -3,21 +3,22 @@ import os
3
  import csv
4
  import huggingface_hub
5
 
 
6
  def log_data(hf_token: str, dataset_name: str, private=True):
7
  path_to_dataset_repo = huggingface_hub.create_repo(
8
- repo_id=dataset_name,
9
- token=hf_token,
10
- private=private,
11
- repo_type="dataset",
12
- exist_ok=True,
13
- )
14
  flagging_dir = "flagged"
15
  dataset_dir = os.path.join(flagging_dir, dataset_name)
16
  repo = huggingface_hub.Repository(
17
- local_dir=dataset_dir,
18
- clone_from=path_to_dataset_repo,
19
- use_auth_token=hf_token,
20
- )
21
  repo.git_pull(lfs=True)
22
  log_file = os.path.join(dataset_dir, "data.csv")
23
 
@@ -38,4 +39,3 @@ def log_data(hf_token: str, dataset_name: str, private=True):
38
  return line_count
39
 
40
  return log_function
41
-
 
3
  import csv
4
  import huggingface_hub
5
 
6
+
7
  def log_data(hf_token: str, dataset_name: str, private=True):
8
  path_to_dataset_repo = huggingface_hub.create_repo(
9
+ repo_id=dataset_name,
10
+ token=hf_token,
11
+ private=private,
12
+ repo_type="dataset",
13
+ exist_ok=True,
14
+ )
15
  flagging_dir = "flagged"
16
  dataset_dir = os.path.join(flagging_dir, dataset_name)
17
  repo = huggingface_hub.Repository(
18
+ local_dir=dataset_dir,
19
+ clone_from=path_to_dataset_repo,
20
+ use_auth_token=hf_token,
21
+ )
22
  repo.git_pull(lfs=True)
23
  log_file = os.path.join(dataset_dir, "data.csv")
24
 
 
39
  return line_count
40
 
41
  return log_function
 
tests/test_preprocessor.py CHANGED
@@ -2,69 +2,44 @@ from crh_preprocessor.preprocessor import preprocess, num2word
2
 
3
 
4
  def test_num2word():
 
5
  assert (
6
- num2word(16) == "on altı"
7
- )
8
- assert (
9
- num2word(1324759813) == "bir milliard üç yüz yigirmi dört million yedi yüz elli doquz biñ sekiz yüz on üç"
10
- )
11
- assert (
12
- num2word(1_000_000) == "million"
13
  )
 
14
 
15
 
16
  def test_preprocessor():
17
  assert (
18
  preprocess("İşanç Alla-Taalâğa.") == "işan\u04ab alla-taalâğa."
19
  ) # first i is two symbols (i without dot and dot)
 
20
  assert (
21
- preprocess("1000000") == "million"
22
- )
23
- assert (
24
- preprocess("1324700000") == "bir milliard üç yüz yigirmi dört million yedi yüz biñ"
25
- )
26
- assert (
27
- preprocess("1000002") == "bir million eki"
28
- )
29
- assert (
30
- preprocess("16") == "on altı"
31
- )
32
- assert (
33
- preprocess("001") == "sıfır sıfır bir"
34
- )
35
- assert (
36
- preprocess("00") == "sıfır sıfır"
37
- )
38
- assert (
39
- preprocess("10.02") == "on noqta sıfır eki"
40
- )
41
- assert (
42
- preprocess("0.01") == "sıfır noqta sıfır bir"
43
- )
44
- assert (
45
- preprocess("0,01") == "sıfır virgül sıfır bir"
46
- )
47
- assert (
48
- preprocess("00,01") == "sıfır sıfır virgül sıfır bir"
49
- )
50
- assert (
51
- preprocess("-10") == "minus on"
52
- )
53
- assert (
54
- preprocess("+10") == "plüs on"
55
- )
56
- assert (
57
- preprocess("+10.1400") == "plüs on noqta bir dört sıfır sıfır"
58
- )
59
- assert (
60
- preprocess("-10.14156") == "minus on noqta bir dört bir beş altı"
61
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  assert (
63
- preprocess("10,14156") == "on virgül bir dört bir beş altı"
64
  )
65
  assert (
66
- preprocess("1, 2, 3, 4, 5, 6,7") == "bir virgül eki virgül"
67
  )
 
68
  assert (
69
- preprocess("1,2,3,4,5,6,7") == "on altı"
 
70
  )
 
2
 
3
 
4
  def test_num2word():
5
+ assert num2word(16) == "on altı"
6
  assert (
7
+ num2word(1324759813)
8
+ == "bir milliard üç yüz yigirmi dört million yedi yüz elli doquz biñ sekiz yüz on üç"
 
 
 
 
 
9
  )
10
+ assert num2word(1_000_000) == "million"
11
 
12
 
13
  def test_preprocessor():
14
  assert (
15
  preprocess("İşanç Alla-Taalâğa.") == "işan\u04ab alla-taalâğa."
16
  ) # first i is two symbols (i without dot and dot)
17
+ assert preprocess("1000000") == "million"
18
  assert (
19
+ preprocess("1324700000")
20
+ == "bir milliard üç yüz yigirmi dört million yedi yüz biñ"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  )
22
+ assert preprocess("1000002") == "bir million eki"
23
+ assert preprocess("16") == "on altı"
24
+ assert preprocess("001") == "sıfır sıfır bir"
25
+ assert preprocess("00") == "sıfır sıfır"
26
+ assert preprocess("10.02") == "on noqta sıfır eki"
27
+ assert preprocess("0.01") == "sıfır noqta sıfır bir"
28
+ assert preprocess("0,01") == "sıfır virgül sıfır bir"
29
+ assert preprocess("00,01") == "sıfır sıfır virgül sıfır bir"
30
+ assert preprocess("-10") == "minus on"
31
+ assert preprocess("+10") == "plüs on"
32
+ assert preprocess("+10.1400") == "plüs on noqta bir dört sıfır sıfır"
33
+ assert preprocess("-10.14156") == "minus on noqta bir dört bir beş altı"
34
+ assert preprocess("10,14156") == "on virgül bir dört bir beş altı"
35
  assert (
36
+ preprocess("1, 2, 3, 4, 5, 6,7") == "bir, eki, üç, dört, beş, altı virgül yedi"
37
  )
38
  assert (
39
+ preprocess("1. 2. 3. 4. 5. 6.7") == "bir. eki. üç. dört. beş. altı noqta yedi"
40
  )
41
+ assert preprocess("1,2,3,4,5,6,7") == "bir eki üç dört beş altı yedi"
42
  assert (
43
+ preprocess("1,2,3,4,5,6,74.3")
44
+ == "bir eki üç dört beş altı yetmiş dört noqta üç"
45
  )