feat: Add an option to pre-process a PDF file

doc-sources
namuan 1 year ago
parent beea5012a3
commit 5d598d44ca

@@ -10,7 +10,11 @@ from rich import print
from doc_search import setup_logging
from doc_search.web import run_web
from doc_search.workflow import training_workflow_steps, workflow_steps
from doc_search.workflow import (
pre_process_workflow_steps,
training_workflow_steps,
workflow_steps,
)
def parse_args() -> Namespace:
@@ -27,6 +31,7 @@ def parse_args() -> Namespace:
parser.add_argument("-w", "--overwrite-index", action="store_true", help="Overwrite existing index")
parser.add_argument("-t", "--train", action="store_true", help="Train and index the PDF file")
parser.add_argument("-a", "--web-app", action="store_true", help="Start WebApp")
parser.add_argument("-p", "--pre-process", action="store_true", help="Extract text from PDF file")
parser.add_argument(
"-v",
@@ -47,6 +52,8 @@ def main() -> None: # pragma: no cover
run_web(context)
elif args.train:
run_workflow(context, training_workflow_steps())
elif args.pre_process:
run_workflow(context, pre_process_workflow_steps())
else:
run_workflow(context, workflow_steps())
print("[bold]Question: " + context["input_question"] + "[/bold]")

@@ -258,6 +258,14 @@ def training_workflow_steps() -> list:
]
def pre_process_workflow_steps() -> list:
    """Return the workflow steps for pre-processing a PDF file, in order.

    The returned list contains the step objects themselves (not instances
    created here), in this sequence: ``VerifyInputFile``,
    ``ConvertPDFToImages``, ``ConvertImagesToText``.
    """
    steps = [VerifyInputFile, ConvertPDFToImages, ConvertImagesToText]
    return steps
def inference_workflow_steps() -> list:
return [
LoadIndex,

Loading…
Cancel
Save