feat: Add an option to pre-process a PDF file

1 year ago · 5d598d44ca
parent beea5012a3
commit 5d598d44ca
2 changed files with 16 additions and 1 deletion
--- a/src/doc_search/app.py
+++ b/src/doc_search/app.py
@@ -10,7 +10,11 @@ from rich import print

 from doc_search import setup_logging
 from doc_search.web import run_web
-from doc_search.workflow import training_workflow_steps, workflow_steps
+from doc_search.workflow import (
+    pre_process_workflow_steps,
+    training_workflow_steps,
+    workflow_steps,
+)


 def parse_args() -> Namespace:
@@ -27,6 +31,7 @@ def parse_args() -> Namespace:
    parser.add_argument("-w", "--overwrite-index", action="store_true", help="Overwrite existing index")
    parser.add_argument("-t", "--train", action="store_true", help="Train and index the PDF file")
    parser.add_argument("-a", "--web-app", action="store_true", help="Start WebApp")
+    parser.add_argument("-p", "--pre-process", action="store_true", help="Extract text from PDF file")

    parser.add_argument(
        "-v",
@@ -47,6 +52,8 @@ def main() -> None:  # pragma: no cover
        run_web(context)
    elif args.train:
        run_workflow(context, training_workflow_steps())
+    elif args.pre_process:
+        run_workflow(context, pre_process_workflow_steps())
    else:
        run_workflow(context, workflow_steps())
        print("[bold]Question: " + context["input_question"] + "[/bold]")
--- a/src/doc_search/workflow/__init__.py
+++ b/src/doc_search/workflow/__init__.py
@@ -258,6 +258,14 @@ def training_workflow_steps() -> list:
    ]


+def pre_process_workflow_steps() -> list:
+    return [
+        VerifyInputFile,
+        ConvertPDFToImages,
+        ConvertImagesToText,
+    ]
+
+
 def inference_workflow_steps() -> list:
    return [
        LoadIndex,