diff --git a/src/doc_search/workflow/__init__.py b/src/doc_search/workflow/__init__.py
index 3e04d12..e81f983 100644
--- a/src/doc_search/workflow/__init__.py
+++ b/src/doc_search/workflow/__init__.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import logging
 import os
 import pickle
+import platform
 import shutil
 import warnings
 from pathlib import Path
@@ -99,11 +100,43 @@ class VerifyInputFile(WorkflowBase):
         }
 
 
+class ImageMagickCommand(WorkflowBase):
+    """
+    Resolve the ImageMagick command prefix for the current operating system.
+
+    On every platform except Windows the plain ``convert`` executable is used.
+    On Windows the IMCONV environment variable must point to ImageMagick's
+    magick.exe, and the command becomes ``<IMCONV> convert``.
+    """
+
+    def execute(self) -> dict:
+        """
+        Return ``{"convert_command": <command prefix>}``.
+
+        Raises AssertionError when running on Windows and IMCONV is not set.
+        """
+        command = "convert"
+
+        if platform.system() == "Windows":
+            image_magick_path = os.getenv("IMCONV")
+            if image_magick_path is None:
+                # Raised explicitly instead of via `assert`: assert statements
+                # are stripped under `python -O`, which would let the string
+                # "None convert" through as the command.
+                raise AssertionError(
+                    "IMCONV environment variable not set. It should point to location of ImageMagick's magick.exe"
+                )
+            command = f"{image_magick_path} {command}"
+
+        return {"convert_command": command}
+
+
 class ConvertPDFToImages(WorkflowBase):
     """
     Convert PDF to images using ImageMagick
     """
 
+    convert_command: str
     input_pdf_path: Path
     app_dir: Path
     start_page: int
@@ -118,7 +151,7 @@ class ConvertPDFToImages(WorkflowBase):
             image_path = output_dir / f"output-{i}.png"
             if image_path.exists():
                 continue
-            convert_command = f"""convert -density 150 -trim -background white -alpha remove -quality 100 -sharpen 0x1.0 {input_file_page} -quality 100 {image_path}"""
+            convert_command = f"""{self.convert_command} -density 150 -trim -background white -alpha remove -quality 100 -sharpen 0x1.0 {input_file_page} -quality 100 {image_path}"""
             run_command(convert_command)
 
         return {"pdf_images_path": output_dir}
@@ -294,6 +327,7 @@ ${question}
 def training_workflow_steps() -> list:
     return [
         VerifyInputFile,
+        ImageMagickCommand,
         ConvertPDFToImages,
         ConvertImagesToText,
         CombineAllText,
@@ -304,6 +338,7 @@ def training_workflow_steps() -> list:
 def pre_process_workflow_steps() -> list:
     return [
         VerifyInputFile,
+        ImageMagickCommand,
         ConvertPDFToImages,
         ConvertImagesToText,
     ]
diff --git a/tests/workflow_steps_test.py b/tests/workflow_steps_test.py
index 6b69eef..a9f7b7b 100644
--- a/tests/workflow_steps_test.py
+++ b/tests/workflow_steps_test.py
@@ -4,6 +4,7 @@ from doc_search.workflow import (
     ConvertImagesToText,
     ConvertPDFToImages,
     CreateIndex,
+    ImageMagickCommand,
     LoadIndex,
     VerifyInputFile,
     workflow_steps,
@@ -15,6 +16,7 @@ def test_return_expected_workflow() -> None:
 
     assert expected_workflow_steps == [
         VerifyInputFile,
+        ImageMagickCommand,
         ConvertPDFToImages,
         ConvertImagesToText,
         CombineAllText,