Skip to content
This repository was archived by the owner on May 3, 2023. It is now read-only.

Commit 4aa68d6

Browse files
author
Markus Konrad
committed
better checking for installed spacy models
1 parent 5f17044 commit 4aa68d6

File tree

2 files changed

+16
-10
lines changed

2 files changed

+16
-10
lines changed

tests/test_corpus.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -344,7 +344,7 @@ def test_corpus_init_otherlang_by_langcode():
344344
if langcode in {'en', 'de'}: continue # this is already tested
345345

346346
if langcode not in installed_lang:
347-
with pytest.raises(SystemExit):
347+
with pytest.raises(RuntimeError):
348348
c.Corpus(docs, language=langcode)
349349
else:
350350
corp = c.Corpus(docs, language=langcode)
@@ -2152,7 +2152,7 @@ def test_corpus_from_builtin_corpus(max_workers, sample):
21522152
lang = corpname[:2]
21532153

21542154
if lang not in installed_lang:
2155-
with pytest.raises(SystemExit):
2155+
with pytest.raises(RuntimeError):
21562156
c.Corpus.from_builtin_corpus(corpname, **kwargs)
21572157
else:
21582158
corp = c.Corpus.from_builtin_corpus(corpname, **kwargs)
@@ -3304,7 +3304,7 @@ def test_builtin_corpora_info(with_paths):
33043304
lang = name[:2]
33053305

33063306
if lang not in installed_lang:
3307-
with pytest.raises(SystemExit):
3307+
with pytest.raises(RuntimeError):
33083308
c.Corpus.from_builtin_corpus(name, load_features=[], sample=5)
33093309
else:
33103310
corp = c.Corpus.from_builtin_corpus(name, load_features=[], sample=5)

tmtoolkit/corpus/_corpus.py

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -137,7 +137,7 @@ def __init__(self, docs: Optional[Union[Dict[str, str], Sequence[Document]]] = N
137137

138138
# declare public attributes
139139
#: SpaCy Language instance
140-
self.nlp: Language
140+
self.nlp: Optional[Language] = None
141141
#: preprocessing pipeline for raw input text; must consist of functions that accept a string and return
142142
# a processed string
143143
self.raw_preproc: List[Callable]
@@ -206,14 +206,14 @@ def __init__(self, docs: Optional[Union[Dict[str, str], Sequence[Document]]] = N
206206
language_model = DEFAULT_LANGUAGE_MODELS[language] + '_' + model_suffix
207207

208208
# model meta information
209-
try:
210-
model_info = spacy.info(language_model)
211-
except (RuntimeError, SystemExit):
209+
if language_model not in spacy.util.get_installed_models():
212210
raise RuntimeError(f'language model "{language_model}" cannot be loaded; are you sure it is installed? '
213211
f'see https://spacy.io/models or '
214212
f'https://tmtoolkit.readthedocs.io/en/latest/install.html for further information '
215213
f'on installing language models')
216214

215+
model_info = spacy.info(language_model)
216+
217217
# the default pipeline components for SpaCy language models – these would be loaded *and enabled* if not
218218
# explicitly excluded
219219
default_components = set(model_info['pipeline'])
@@ -534,17 +534,23 @@ def ngrams_join_str(self) -> str:
534534
@property
535535
def language(self) -> str:
536536
"""Return Corpus language as two-letter ISO 639-1 language code."""
537-
return self.nlp.lang
537+
if self.nlp:
538+
return self.nlp.lang
539+
else:
540+
return '<not initialized>'
538541

539542
@property
540543
def language_model(self) -> str:
541544
"""Return name of the language model that was loaded."""
542-
return self.nlp.lang + '_' + self.nlp.meta['name']
545+
if self.nlp:
546+
return self.nlp.lang + '_' + self.nlp.meta['name']
547+
else:
548+
return '<not initialized>'
543549

544550
@property
545551
def has_sents(self) -> bool:
546552
"""Return True if information on sentence borders was parsed for documents in this corpus, else return False."""
547-
return 'parser' in self.nlp.pipe_names or 'senter' in self.nlp.pipe_names
553+
return self.nlp and ('parser' in self.nlp.pipe_names or 'senter' in self.nlp.pipe_names)
548554

549555
@property
550556
def doc_labels(self) -> List[str]:

0 commit comments

Comments
 (0)