[huggingface] Allow retrying failed models in the model converter (#1989)

frankfliu · zachgk · web-flow · commit d3b2b55e9b73 · 2022-09-06T15:30:24.000-07:00
* [huggingface] Allow retrying failed models in the model converter

Co-authored-by: Zach Kimberg <zachary@kimberg.com>
diff --git a/extensions/tokenizers/src/main/python/.gitignore b/extensions/tokenizers/src/main/python/.gitignore
@@ -1,3 +1,4 @@
 __pycache__
 model/
-processed_models.json
+tmp/
+models.json
diff --git a/extensions/tokenizers/src/main/python/arg_parser.py b/extensions/tokenizers/src/main/python/arg_parser.py
@@ -23,6 +23,10 @@ def converter_args():
                         default=1,
                         help="Max amount of models to convert")
     parser.add_argument("-o", "--output-dir", help="Model output directory")
+    parser.add_argument("-r",
+                        "--retry-failed",
+                        action='store_true',
+                        help="Retry failed model")
     group = parser.add_mutually_exclusive_group(required=True)
     group.add_argument(
         "-c",
diff --git a/extensions/tokenizers/src/main/python/huggingface_models.py b/extensions/tokenizers/src/main/python/huggingface_models.py
@@ -25,7 +25,7 @@
     "ForTokenClassification": "token-classification",
     "ForSequenceClassification": "text-classification",
     "ForMultipleChoice": "text-classification",
-    "ForMaskedLM": "fill-mask"
+    "ForMaskedLM": "fill-mask",
 }
 LANGUAGES = ModelSearchArguments().language
 
@@ -48,7 +48,7 @@ def __init__(self, output_dir: str):
         self.output_dir = output_dir
         self.processed_models = {}
 
-        output_path = os.path.join(output_dir, "processed_models.json")
+        output_path = os.path.join(output_dir, "models.json")
         if os.path.exists(output_path):
             with open(output_path, "r") as f:
                 self.processed_models = json.load(f)
@@ -90,8 +90,9 @@ def list_models(self, args: Namespace) -> List[dict]:
             existing_model = self.processed_models.get(model_id)
             if existing_model:
                 existing_model["downloads"] = model_info.downloads
-                logging.info(f"Skip converted mode: {model_id}.")
-                continue
+                if not args.retry_failed:
+                    logging.info(f"Skip converted model: {model_id}.")
+                    continue
 
             try:
                 config = hf_hub_download(repo_id=model_id,