feat: Add steps to convert pdf file to scanned pages of text
parent
a41d95f5bf
commit
2a793f8656
@ -1,7 +1,11 @@
|
||||
from .convert_images_to_text import ConvertImagesToText
|
||||
from .convert_pdf_to_pages import ConvertPDFToImages
|
||||
from .verify_input_file import VerifyInputFile
|
||||
|
||||
|
||||
def workflow_steps() -> list:
    """Return the workflow step classes in execution order.

    The order is: verify the input file, convert the PDF to images,
    then convert the images to text.
    """
    steps = [VerifyInputFile, ConvertPDFToImages, ConvertImagesToText]
    return steps
|
||||
|
@ -0,0 +1,28 @@
|
||||
from pathlib import Path
|
||||
|
||||
from py_executable_checklist.workflow import WorkflowBase, run_command
|
||||
|
||||
|
||||
class ConvertImagesToText(WorkflowBase):
    """
    Convert images to text using Tesseract OCR.

    Every ``*.png`` file found in ``pdf_images_path`` is passed to the
    ``tesseract`` command; the results are written under
    ``<app_dir>/OutputDir/dr-doc-search/<pdf file stem>/scanned``.
    """

    pdf_images_path: Path
    input_pdf_path: Path
    app_dir: Path

    def execute(self) -> dict:
        """
        Run OCR on each page image that does not have a text file yet.

        Returns:
            A dict with the key ``pages_text_path`` holding the output directory.
        """
        pdf_file_name = self.input_pdf_path.stem
        output_dir = self.app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "scanned"
        output_dir.mkdir(parents=True, exist_ok=True)

        for image_path in self.pdf_images_path.glob("*.png"):
            image_name = image_path.stem
            # Tesseract receives an output *base* name and appends ".txt" itself,
            # so the file that ends up on disk is "<image_name>.txt".
            text_path = output_dir / f"{image_name}"
            text_file = output_dir / f"{image_name}.txt"
            # Check the real output file; checking the extension-less base name
            # never matches and would re-run OCR for every image on every run.
            if text_file.exists():
                continue
            # NOTE(review): paths are interpolated into the command string unquoted;
            # paths containing spaces may break depending on how run_command
            # tokenizes the string - confirm.
            tesseract_command = f"tesseract {image_path} {text_path} --oem 1 -l eng"
            run_command(tesseract_command)

        return {"pages_text_path": output_dir}
|
@ -0,0 +1,28 @@
|
||||
from pathlib import Path
|
||||
|
||||
from py_executable_checklist.workflow import WorkflowBase, run_command
|
||||
|
||||
|
||||
class ConvertPDFToImages(WorkflowBase):
    """
    Render each page of the input PDF as a PNG image with ImageMagick's ``convert``.

    Images are written to
    ``<app_dir>/OutputDir/dr-doc-search/<pdf file stem>/images`` as
    ``output-<page index>.png``.
    """

    input_pdf_path: Path
    pdf_pages: int
    app_dir: Path

    def execute(self) -> dict:
        """
        Convert every page index in ``range(pdf_pages)`` whose image is missing.

        Returns:
            A dict with the key ``pdf_images_path`` holding the images directory.
        """
        images_dir = self.app_dir / "OutputDir/dr-doc-search" / self.input_pdf_path.stem / "images"
        images_dir.mkdir(parents=True, exist_ok=True)

        for page_index in range(self.pdf_pages):
            page_image = images_dir / f"output-{page_index}.png"
            if not page_image.exists():
                # "<pdf>[<n>]" selects a single page of the PDF for conversion.
                page_selector = f"{self.input_pdf_path}[{page_index}]"
                command = f"""convert -density 150 -trim -background white -alpha remove -quality 100 -sharpen 0x1.0 {page_selector} -quality 100 {page_image}"""
                run_command(command)

        return {"pdf_images_path": images_dir}
|
@ -0,0 +1,20 @@
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from py_executable_checklist.workflow import run_workflow
|
||||
|
||||
from doc_search.workflow import ConvertImagesToText
|
||||
|
||||
|
||||
def test_images_to_text() -> None:
    """Run ConvertImagesToText alone and check the reported directory holds two text files."""
    scanned_dir = Path("tests/OutputDir/dr-doc-search/input/scanned")
    workflow_context: dict[str, Any] = dict(
        input_pdf_path=Path("tests/data/input.pdf"),
        pdf_images_path=Path("tests/data/images/"),
        app_dir=Path(".") / "tests",
    )

    run_workflow(workflow_context, [ConvertImagesToText])

    # The step publishes its output directory back into the context.
    assert workflow_context["pages_text_path"] == scanned_dir
    text_files = list(scanned_dir.glob("*.txt"))
    assert len(text_files) == 2
|
@ -0,0 +1,20 @@
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from py_executable_checklist.workflow import run_workflow
|
||||
|
||||
from doc_search.workflow import ConvertPDFToImages
|
||||
|
||||
|
||||
def test_convert_pdf_to_pages() -> None:
    """Run ConvertPDFToImages alone and check the reported directory holds two PNG images."""
    images_dir = Path("tests/OutputDir/dr-doc-search/input/images")
    workflow_context: dict[str, Any] = dict(
        input_pdf_path=Path("tests/data/input.pdf"),
        pdf_pages=2,
        app_dir=Path(".") / "tests",
    )

    run_workflow(workflow_context, [ConvertPDFToImages])

    # The step publishes its output directory back into the context.
    assert workflow_context["pdf_images_path"] == images_dir
    png_files = list(images_dir.glob("*.png"))
    assert len(png_files) == 2
|
Binary file not shown.
After Width: | Height: | Size: 31 KiB |
Binary file not shown.
After Width: | Height: | Size: 25 KiB |
@ -1,7 +0,0 @@
|
||||
from doc_search.workflow import VerifyInputFile, workflow_steps
|
||||
|
||||
|
||||
def test_return_expected_workflow() -> None:
    """Check that workflow_steps() returns only the input verification step."""
    actual_steps = workflow_steps()
    assert actual_steps == [VerifyInputFile]
|
@ -0,0 +1,20 @@
|
||||
from doc_search.workflow import (
|
||||
ConvertImagesToText,
|
||||
ConvertPDFToImages,
|
||||
VerifyInputFile,
|
||||
workflow_steps,
|
||||
)
|
||||
|
||||
|
||||
def test_return_expected_workflow() -> None:
    """Check that workflow_steps() returns the three conversion steps in order."""
    wanted_steps = [
        VerifyInputFile,
        ConvertPDFToImages,
        ConvertImagesToText,
        # Commented-out entries kept from the original list
        # (presumably steps still to be added - confirm):
        # LoadExistingIndex,
        # CreateIndexIfNotExists,
        # FindInterestingSections,
        # AskQuestion
    ]

    actual_steps = workflow_steps()

    assert actual_steps == wanted_steps
|
Loading…
Reference in New Issue