better checking for installed spacy models

Markus Konrad · Markus Konrad · commit 4aa68d6cd8d6 · 2022-03-11T13:53:13.000+01:00
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
@@ -344,7 +344,7 @@ def test_corpus_init_otherlang_by_langcode():
         if langcode in {'en', 'de'}: continue  # this is already tested
 
         if langcode not in installed_lang:
-            with pytest.raises(SystemExit):
+            with pytest.raises(RuntimeError):
                 c.Corpus(docs, language=langcode)
         else:
             corp = c.Corpus(docs, language=langcode)
@@ -2152,7 +2152,7 @@ def test_corpus_from_builtin_corpus(max_workers, sample):
             lang = corpname[:2]
 
             if lang not in installed_lang:
-                with pytest.raises(SystemExit):
+                with pytest.raises(RuntimeError):
                     c.Corpus.from_builtin_corpus(corpname, **kwargs)
             else:
                 corp = c.Corpus.from_builtin_corpus(corpname, **kwargs)
@@ -3304,7 +3304,7 @@ def test_builtin_corpora_info(with_paths):
             lang = name[:2]
 
             if lang not in installed_lang:
-                with pytest.raises(SystemExit):
+                with pytest.raises(RuntimeError):
                     c.Corpus.from_builtin_corpus(name, load_features=[], sample=5)
             else:
                 corp = c.Corpus.from_builtin_corpus(name, load_features=[], sample=5)
diff --git a/tmtoolkit/corpus/_corpus.py b/tmtoolkit/corpus/_corpus.py
@@ -137,7 +137,7 @@ def __init__(self, docs: Optional[Union[Dict[str, str], Sequence[Document]]] = N
 
         # declare public attributes
         #: SpaCy Language instance
-        self.nlp: Language
+        self.nlp: Optional[Language] = None
         #: preprocessing pipeline for raw input text; must consist of functions that accept a string and return
         #  a processed string
         self.raw_preproc: List[Callable]
@@ -206,14 +206,14 @@ def __init__(self, docs: Optional[Union[Dict[str, str], Sequence[Document]]] = N
                 language_model = DEFAULT_LANGUAGE_MODELS[language] + '_' + model_suffix
 
             # model meta information
-            try:
-                model_info = spacy.info(language_model)
-            except (RuntimeError, SystemExit):
+            if language_model not in spacy.util.get_installed_models():
                 raise RuntimeError(f'language model "{language_model}" cannot be loaded; are you sure it is installed? '
                                    f'see https://spacy.io/models or '
                                    f'https://tmtoolkit.readthedocs.io/en/latest/install.html for further information '
                                    f'on installing language models')
 
+            model_info = spacy.info(language_model)
+
             # the default pipeline components for SpaCy language models – these would be loaded *and enabled* if not
             # explicitly excluded
             default_components = set(model_info['pipeline'])
@@ -534,17 +534,23 @@ def ngrams_join_str(self) -> str:
     @property
     def language(self) -> str:
         """Return Corpus language as two-letter ISO 639-1 language code."""
-        return self.nlp.lang
+        if self.nlp:
+            return self.nlp.lang
+        else:
+            return '<not initialized>'
 
     @property
     def language_model(self) -> str:
         """Return name of the language model that was loaded."""
-        return self.nlp.lang + '_' + self.nlp.meta['name']
+        if self.nlp:
+            return self.nlp.lang + '_' + self.nlp.meta['name']
+        else:
+            return '<not initialized>'
 
     @property
     def has_sents(self) -> bool:
         """Return True if information on sentence borders was parsed for documents in this corpus, else return False."""
-        return 'parser' in self.nlp.pipe_names or 'senter' in self.nlp.pipe_names
+        return self.nlp and ('parser' in self.nlp.pipe_names or 'senter' in self.nlp.pipe_names)
 
     @property
     def doc_labels(self) -> List[str]: