Skip to content

Commit 23800cb

Browse files
authored
Merge pull request #306 from makcedward/dev
Release 1.1.11
2 parents 487d9c8 + d44804d commit 23800cb

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+639
-605
lines changed

CHANGE.md

+6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
NLPAUG Change Log
22
================
33

4+
### 1.1.11 Jul 6, 2022
5+
* [Return list of output](https://github.com/makcedward/nlpaug/issues/302)
6+
* [Fix download util](https://github.com/makcedward/nlpaug/issues/301)
7+
* [Fix lambda label misalignment](https://github.com/makcedward/nlpaug/issues/295)
8+
* [Add language pack reference link for SynonymAug](https://github.com/makcedward/nlpaug/issues/289)
9+
410
### 1.1.10 Dec 23, 2021
511
* [KeywordAug supports Turkish](https://github.com/makcedward/nlpaug/pull/261)
612
* [Fix FrequencyMasking time range](https://github.com/makcedward/nlpaug/pull/258)

README.md

+7-6
Original file line numberDiff line numberDiff line change
@@ -139,16 +139,17 @@ http://paraphrase.org/#/download
139139

140140
If you use PitchAug, SpeedAug and VtlpAug, installing the following dependencies as well
141141
```bash
142-
pip install librosa>=0.7.1 matplotlib
142+
pip install librosa>=0.9.1 matplotlib
143143
```
144144

145145
## Recent Changes
146146

147-
### 1.1.10 Dec 23, 2021
148-
* [KeywordAug supports Turkish](https://github.com/makcedward/nlpaug/pull/261)
149-
* [Fix FrequencyMasking time range](https://github.com/makcedward/nlpaug/pull/258)
150-
* [Remove unnecessary printout](https://github.com/makcedward/nlpaug/pull/263)
151-
* [Rollback ContextualWordEmbsForSentenceAug and AbstSummAug to use custom transformers API to reduce execution time]
147+
### 1.1.11 Jul 6, 2022
148+
* [Return list of output](https://github.com/makcedward/nlpaug/issues/302)
149+
* [Fix download util](https://github.com/makcedward/nlpaug/issues/301)
150+
* [Fix lambda label misalignment](https://github.com/makcedward/nlpaug/issues/295)
151+
* [Add language pack reference link for SynonymAug](https://github.com/makcedward/nlpaug/issues/289)
152+
152153

153154
See [changelog](https://github.com/makcedward/nlpaug/blob/master/CHANGE.md) for more details.
154155

docs/conf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,9 @@ def __getattr__(cls, name):
7474
# built documents.
7575
#
7676
# The short X.Y version.
77-
version = '1.1.11_dev'
77+
version = '1.1.11'
7878
# The full version, including alpha/beta/rc tags.
79-
release = '1.1.11_dev'
79+
release = '1.1.11'
8080

8181
# The language for content autogenerated by Sphinx. Refer to documentation
8282
# for a list of supported languages.

nlpaug/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
__all__ = ['base_augmenter']
55

6-
__version__ = '1.1.11_dev'
6+
__version__ = '1.1.11'
77
__description__ = 'Natural language processing augmentation library for deep neural networks.'
88
__url__ = 'https://github.com/makcedward/nlpaug'
99
__author__ = 'Edward Ma'

nlpaug/augmenter/sentence/context_word_embs_sentence.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import os
6+
from typing import Iterable
67

78
from nlpaug.augmenter.sentence import SentenceAugmenter
89
import nlpaug.model.lang_models as nml
@@ -102,14 +103,15 @@ def insert(self, data):
102103
if not data:
103104
return data
104105

105-
if isinstance(data, list):
106-
all_data = data
107-
else:
106+
if isinstance(data, str):
108107
if data.strip() == '':
109108
return data
110-
111109
all_data = [data]
112-
110+
elif isinstance(data, Iterable):
111+
all_data = data
112+
else:
113+
all_data = [data]
114+
113115
if self.use_custom_api:
114116
return self._custom_insert(all_data)
115117
else:

nlpaug/augmenter/sentence/sentence_augmenter.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@ def __init__(self, action, name='Sentence_Aug', stopwords=None, tokenizer=None,
1818

1919
@classmethod
2020
def clean(cls, data):
21+
if isinstance(data, str):
22+
return data.strip()
2123
if isinstance(data, Iterable):
2224
return [d.strip() for d in data]
23-
return data.strip()
25+
return str(data).strip()
2426

2527
@classmethod
2628
def is_duplicate(cls, dataset, data):

nlpaug/augmenter/word/synonym.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ class SynonymAug(WordAugmenter):
3232
3333
:param str aug_src: Support 'wordnet' and 'ppdb' .
3434
:param str model_path: Path of dictionary. Mandatory field if using PPDB as data source
35-
:param str lang: Language of your text. Default value is 'eng'.
35+
:param str lang: Language of your text. Default value is 'eng'. For `wordnet`, you can choose lang from this list
36+
http://compling.hss.ntu.edu.sg/omw/. For `ppdb`, you simply download the corresponding language pack from
37+
http://paraphrase.org/#/download.
3638
:param float aug_p: Percentage of word will be augmented.
3739
:param int aug_min: Minimum number of word will be augmented.
3840
:param int aug_max: Maximum number of word will be augmented. If None is passed, number of augmentation is

nlpaug/augmenter/word/word_augmenter.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@ def __init__(self, action, name='Word_Aug', aug_min=1, aug_max=10, aug_p=0.3, st
2323

2424
@classmethod
2525
def clean(cls, data):
26+
if isinstance(data, str):
27+
return data.strip()
2628
if isinstance(data, Iterable) :
2729
return [d.strip() if d else d for d in data]
28-
return data.strip()
30+
return str(data).strip()
2931

3032
def skip_aug(self, token_idxes, tokens):
3133
return token_idxes

nlpaug/base_augmenter.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,13 @@ def augment(self, data, n=1, num_thread=1):
6363

6464
# Return empty value per data type
6565
if isinstance(data, str):
66-
return ''
66+
return []
6767
elif isinstance(data, list):
6868
return []
6969
elif isinstance(data, np.ndarray):
7070
return np.array([])
7171

72-
return None
72+
return []
7373

7474
action_fx = None
7575
clean_data = self.clean(data)
@@ -125,10 +125,9 @@ def augment(self, data, n=1, num_thread=1):
125125
if len(augmented_results) >= expected_output_num:
126126
break
127127

128-
# TODO: standardize output to list even though n=1 from 1.0.0
129128
if len(augmented_results) == 0:
130129
# if not result, return itself
131-
if n == 1:
130+
if isinstance(data, list):
132131
return data
133132
# Single input with/without multiple input
134133
else:
@@ -140,8 +139,6 @@ def augment(self, data, n=1, num_thread=1):
140139
if isinstance(data, list):
141140
return augmented_results
142141
else:
143-
if n == 1:
144-
return augmented_results[0]
145142
return augmented_results[:n]
146143

147144
# return augmented_results

nlpaug/flow/pipeline.py

+8-16
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def augment(self, data, n=1, num_thread=1):
5959
else:
6060
if self.device == 'cpu':
6161
augmented_results = self._parallel_augment(self._augment, data, n=n, num_thread=num_thread)
62+
6263
# TODO: Externalize to util for checking
6364
elif 'cuda' in self.device:
6465
# TODO: support multiprocessing for GPU
@@ -67,24 +68,21 @@ def augment(self, data, n=1, num_thread=1):
6768
else:
6869
raise ValueError('Unsupported device mode [{}]. Only support `cpu` or `cuda`'.format(self.device))
6970

71+
# Flatten nested list
72+
augmented_results = [r for sub_results in augmented_results for r in sub_results if len(r) > 0]
7073
for augmented_result in augmented_results:
7174
if is_duplicate_fx is not None and not is_duplicate_fx(results + [data], augmented_result):
72-
results.append(augmented_result)
75+
results.extend(augmented_result)
7376

7477
if len(results) >= n:
7578
break
7679
if len(results) >= n:
7780
break
7881

79-
# TODO: standardize output to list even though n=1
8082
if len(results) == 0:
81-
# if not result, return itself
82-
if n == 1:
83-
return data
84-
else:
85-
return [data]
86-
if n == 1:
87-
return results[0]
83+
if len(data) == 0:
84+
return []
85+
return [data]
8886
return results[:n]
8987

9088
def _augment(self, data, n=1, num_thread=1):
@@ -115,16 +113,10 @@ def _augment(self, data, n=1, num_thread=1):
115113
results.append(augmented_data)
116114
break
117115

118-
# TODO: standardize output to list even though n=1
119116
output = None
120117
if len(results) == 0:
121118
# if not result, return itself
122-
if n == 1:
123-
output = data
124-
else:
125-
output = [data]
126-
elif n == 1:
127-
output = results[0]
119+
output = [data]
128120
else:
129121
output = results[:n]
130122

nlpaug/model/audio/pitch.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,6 @@ def __init__(self):
2121
def manipulate(self, data, start_pos, end_pos, pitch_level, sampling_rate):
2222
aug_data = data.copy()
2323
aug_data[start_pos:end_pos] = librosa.effects.pitch_shift(
24-
aug_data[start_pos:end_pos], sampling_rate, pitch_level)
24+
y=aug_data[start_pos:end_pos], sr=sampling_rate, n_steps=pitch_level)
2525

2626
return aug_data

nlpaug/model/audio/speed.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,5 @@ def __init__(self):
1919
raise ModuleNotFoundError('Missed librosa library. Install it via `pip install librosa`')
2020

2121
def manipulate(self, data, start_pos, end_pos, speed):
22-
aug_data = librosa.effects.time_stretch(data[start_pos:end_pos], speed)
22+
aug_data = librosa.effects.time_stretch(y=data[start_pos:end_pos], rate=speed)
2323
return np.concatenate((data[:start_pos], aug_data, data[end_pos:]), axis=0)

nlpaug/model/lang_models/lambada.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def _generate(self, texts, n):
6464
results = []
6565
# Encode
6666
for label in texts:
67-
input_text = 'label_{} {}'.format(label, self.sep_token)
67+
input_text = '{} {}'.format(label, self.sep_token)
6868
input_ids = self.gen_tokenizer.encode(input_text, add_special_tokens=False, return_tensors='pt')
6969
input_ids = input_ids.to(self.device)
7070

nlpaug/model/word_dict/wordnet.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,13 @@ def __init__(self, lang, is_synonym=True):
3333
self.model = self.read()
3434

3535
def read(self):
36-
return wordnet
36+
try:
37+
wordnet.synsets('testing')
38+
return wordnet
39+
except LookupError:
40+
nltk.download('wordnet')
41+
nltk.download('omw-1.4')
42+
return wordnet
3743

3844
def predict(self, word, pos=None):
3945
results = []
@@ -48,4 +54,10 @@ def predict(self, word, pos=None):
4854

4955
@classmethod
5056
def pos_tag(cls, tokens):
51-
return nltk.pos_tag(tokens)
57+
try:
58+
results = nltk.pos_tag(tokens)
59+
except LookupError:
60+
nltk.download('averaged_perceptron_tagger')
61+
results = nltk.pos_tag(tokens)
62+
63+
return results

requirements_dev.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ pyinstrument
55
transformers
66
torch
77
simpletransformers
8-
gensim>=4.1.2
8+
gensim>=4.1.2
9+
librosa>=0.9

scripts/lambada/data_processing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
def prepare_mlm_data(labels, texts, output_file_path, sep_token):
77
with open(os.path.join(output_file_path, 'mlm_data.txt'), 'w') as f:
88
for label, text in zip(labels, texts):
9-
f.write(' '.join([label, sep_token, text]) + '\n')
9+
f.write(' '.join([str(label), sep_token, text]) + '\n')
1010

1111
def main(args):
1212
data = pd.read_csv(args.data_path)

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
setup(
1414
name="nlpaug",
15-
version="1.1.11_dev",
15+
version="1.1.11",
1616
author="Edward Ma",
1717
author_email="[email protected]",
1818
url="https://github.com/makcedward/nlpaug",

test/augmenter/audio/test_audio.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,5 @@ def test_coverage_and_zone(self):
5151

5252
for aug in augs:
5353
aug_data = aug.augment(self.audio)
54-
self.assertTrue(len(aug_data[aug.start_pos:aug.end_pos]), int(len(self.audio) * (zone[1] - zone[0]) * coverage))
54+
aug_audio = aug_data[0]
55+
self.assertTrue(len(aug_audio[aug.start_pos:aug.end_pos]), int(len(self.audio) * (zone[1] - zone[0]) * coverage))

test/augmenter/audio/test_crop.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -22,21 +22,24 @@ def setUpClass(cls):
2222
def test_empty_input(self):
2323
audio = np.array([])
2424
aug = naa.CropAug(sampling_rate=self.sampling_rate)
25-
augmented_audio = aug.augment(audio)
25+
augmented_data = aug.augment(audio)
2626

27-
self.assertTrue(np.array_equal(audio, augmented_audio))
27+
self.assertTrue(np.array_equal(audio, augmented_data))
2828

2929
def test_substitute(self):
3030
aug = naa.CropAug(sampling_rate=self.sampling_rate)
31-
augmented_audio = aug.augment(self.audio)
31+
augmented_data = aug.augment(self.audio)
32+
augmented_audio = augmented_data[0]
3233

3334
self.assertNotEqual(len(self.audio), len(augmented_audio))
3435

3536
def test_coverage(self):
3637
aug = naa.CropAug(sampling_rate=self.sampling_rate, coverage=0.1)
3738
augmented_data = aug.augment(self.audio)
39+
augmented_audio = augmented_data[0]
40+
3841
audio_size = len(self.audio)
39-
augmented_size = len(augmented_data)
42+
augmented_size = len(augmented_audio)
4043
expected_crop_size = len(self.audio) * (aug.zone[1] - aug.zone[0]) * 0.1
4144

4245
self.assertTrue(-1 <= audio_size - augmented_size - expected_crop_size <= 1)
@@ -47,8 +50,10 @@ def test_duration(self):
4750

4851
for _ in range(10):
4952
aug = naa.CropAug(sampling_rate=self.sampling_rate, duration=duration, stateless=False)
50-
aug_data = aug.augment(self.audio)
51-
aug_size = len(aug_data)
53+
augmented_data = aug.augment(self.audio)
54+
augmented_audio = augmented_data[0]
55+
56+
aug_size = len(augmented_audio)
5257
expected_crop_size = self.sampling_rate * duration
5358

5459
self.assertGreater(audio_size, aug_size)

test/augmenter/audio/test_inversion.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,14 @@ def setUpClass(cls):
2222
def test_empty_input(self):
2323
audio = np.array([])
2424
aug = naa.PolarityInverseAug()
25-
augmented_audio = aug.augment(audio)
25+
augmented_data = aug.augment(audio)
2626

27-
self.assertTrue(np.array_equal(audio, augmented_audio))
27+
self.assertTrue(np.array_equal(audio, augmented_data))
2828

2929
def test_inverse(self):
3030
aug = naa.PolarityInverseAug()
31-
augmented_audio = aug.augment(self.audio)
31+
augmented_data = aug.augment(self.audio)
32+
augmented_audio = augmented_data[0]
3233

3334
self.assertFalse(np.array_equal(self.audio, augmented_audio))
3435
self.assertEqual(len(self.audio), len(augmented_audio))

test/augmenter/audio/test_loudness.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,14 @@ def setUpClass(cls):
2222
def test_empty_input(self):
2323
audio = np.array([])
2424
aug = naa.LoudnessAug()
25-
augmented_audio = aug.augment(audio)
25+
augmented_data = aug.augment(audio)
2626

27-
self.assertTrue(np.array_equal(audio, augmented_audio))
27+
self.assertTrue(np.array_equal(audio, augmented_data))
2828

2929
def test_substitute(self):
3030
aug = naa.LoudnessAug()
31-
augmented_audio = aug.augment(self.audio)
31+
augmented_data = aug.augment(self.audio)
32+
augmented_audio = augmented_data[0]
3233

3334
self.assertFalse(np.array_equal(self.audio, augmented_audio))
3435
self.assertEqual(len(self.audio), len(augmented_audio))

0 commit comments

Comments
 (0)