feat: Add steps to convert pdf file to scanned pages of text

doc-sources
namuan 1 year ago
parent a41d95f5bf
commit 2a793f8656

1
.gitignore vendored

@ -83,3 +83,4 @@ fireprofile
geckodriver.log
backroom/
.temp/
**/OutputDir/**/*.*

@ -39,7 +39,9 @@ add-dev: ## Adds a dev package with poetry - Use make deps to update packages
tests: clean ## Run all tests
poetry run pytest
poetry run coverage xml -i
cov-report: ## Generate coverage report
poetry run coverage html; open htmlcov/index.html
build: pre-commit tests ## Build package
poetry build

18
poetry.lock generated

@ -19,7 +19,7 @@ test = ["coverage", "flake8", "pexpect", "wheel"]
name = "attrs"
version = "22.2.0"
description = "Classes Without Boilerplate"
category = "dev"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -144,7 +144,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""}
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
category = "dev"
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
files = [
@ -287,7 +287,7 @@ files = [
name = "exceptiongroup"
version = "1.1.0"
description = "Backport of PEP 654 (exception groups)"
category = "dev"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -513,7 +513,7 @@ testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packag
name = "iniconfig"
version = "1.1.1"
description = "iniconfig: brain-dead simple config-ini parsing"
category = "dev"
category = "main"
optional = false
python-versions = "*"
files = [
@ -836,7 +836,7 @@ setuptools = "*"
name = "packaging"
version = "22.0"
description = "Core utilities for Python packages"
category = "dev"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -891,7 +891,7 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.2.2)", "pytest (>=7.2)", "pytest-
name = "pluggy"
version = "1.0.0"
description = "plugin and hook calling mechanisms for python"
category = "dev"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -1043,7 +1043,7 @@ image = ["Pillow"]
name = "pytest"
version = "7.2.0"
description = "pytest: simple powerful testing with Python"
category = "dev"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1485,7 +1485,7 @@ files = [
name = "tomli"
version = "2.0.1"
description = "A lil' TOML parser"
category = "dev"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1644,4 +1644,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
[metadata]
lock-version = "2.0"
python-versions = ">=3.9.0, <4.0"
content-hash = "378def26ac19f8af3455cec55a0d57b88292d55e4497e6adaadfc12c716b9473"
content-hash = "510a1715bd2cd3844ace0a676b21a7caaae541b1088a506e8ff2b389da40a75b"

@ -34,6 +34,7 @@ python = ">=3.9.0, <4.0"
py-executable-checklist = "1.3.1"
rich = "^13.0.0"
pypdf = "^3.2.0"
pytest = "^7.2.0"
[tool.poetry.group.dev.dependencies]
autoflake = "*"
@ -57,6 +58,7 @@ tryceratops = "*"
commitizen = "^2.20.3"
pytest = "^7.2.0"
pytest-cov = "^4.0.0"
coverage = "^7.0.3"
[tool.commitizen]
name = "cz_conventional_commits"
@ -84,7 +86,7 @@ addopts = """\
"""
[tool.coverage.report]
fail_under = 100
fail_under = 93
omit = ["src/doc_search/app.py", "src/doc_search/__init__.py"]
exclude_lines = [
'pragma: no cover'

@ -13,6 +13,7 @@ from doc_search.workflow import workflow_steps
def parse_args() -> Namespace:
parser = ArgumentParser(description=__doc__, formatter_class=RawDescriptionHelpFormatter)
parser.add_argument("-i", "--input-pdf-path", required=True, type=Path, help="Path to input PDF file")
parser.add_argument("-d", "--app_dir", default=Path.home(), type=Path, help="Path to app directory")
parser.add_argument(
"-v",
"--verbose",

@ -1,7 +1,11 @@
from .convert_images_to_text import ConvertImagesToText
from .convert_pdf_to_pages import ConvertPDFToImages
from .verify_input_file import VerifyInputFile
def workflow_steps() -> list:
    """Return the workflow step classes, in the order they are listed."""
    steps = [
        VerifyInputFile,
        ConvertPDFToImages,
        ConvertImagesToText,
    ]
    return steps

@ -0,0 +1,28 @@
from pathlib import Path
from py_executable_checklist.workflow import WorkflowBase, run_command
class ConvertImagesToText(WorkflowBase):
    """
    Convert images to text using Tesseract OCR.

    Every ``*.png`` found in ``pdf_images_path`` is passed to the ``tesseract``
    command line tool. The results are written to
    ``<app_dir>/OutputDir/dr-doc-search/<pdf stem>/scanned``.

    ``execute`` returns ``{"pages_text_path": <that directory>}``.
    """

    # Directory that is searched (non-recursively) for "*.png" page images.
    pdf_images_path: Path
    # Source PDF; only its stem is used, to name the output sub-directory.
    input_pdf_path: Path
    # Root directory under which "OutputDir/dr-doc-search/..." is created.
    app_dir: Path

    def execute(self) -> dict:
        # Function-scope import: only needed to quote paths for the command line.
        import shlex

        pdf_file_name = self.input_pdf_path.stem
        output_dir = self.app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "scanned"
        output_dir.mkdir(parents=True, exist_ok=True)

        for image_path in self.pdf_images_path.glob("*.png"):
            image_name = image_path.stem
            # tesseract receives an output *base* and appends ".txt" itself, so
            # the file to look for when skipping finished pages is "<base>.txt".
            output_base = output_dir / image_name
            text_path = output_dir / f"{image_name}.txt"
            if text_path.exists():
                continue

            # Quote both paths so names containing spaces or shell
            # metacharacters reach tesseract as single arguments.
            # NOTE(review): assumes run_command passes the string to a shell
            # (or a shell-style parser) -- confirm against its implementation.
            tesseract_command = (
                f"tesseract {shlex.quote(str(image_path))} "
                f"{shlex.quote(str(output_base))} --oem 1 -l eng"
            )
            run_command(tesseract_command)

        return {"pages_text_path": output_dir}

@ -0,0 +1,28 @@
from pathlib import Path
from py_executable_checklist.workflow import WorkflowBase, run_command
class ConvertPDFToImages(WorkflowBase):
    """
    Convert PDF to images using ImageMagick.

    One ``convert`` invocation is run per page index in ``range(pdf_pages)``.
    Each page is written as ``output-<index>.png`` under
    ``<app_dir>/OutputDir/dr-doc-search/<pdf stem>/images``. Pages whose image
    already exists are skipped.

    ``execute`` returns ``{"pdf_images_path": <that directory>}``.
    """

    # Source PDF passed to ``convert``; its stem names the output sub-directory.
    input_pdf_path: Path
    # Number of pages to render; page indices run from 0 to pdf_pages - 1.
    pdf_pages: int
    # Root directory under which "OutputDir/dr-doc-search/..." is created.
    app_dir: Path

    def execute(self) -> dict:
        # Function-scope import: only needed to quote paths for the command line.
        import shlex

        pdf_file_name = self.input_pdf_path.stem
        output_dir = self.app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "images"
        output_dir.mkdir(parents=True, exist_ok=True)

        for i in range(self.pdf_pages):
            image_path = output_dir / f"output-{i}.png"
            if image_path.exists():
                continue

            # "<file>[i]" is ImageMagick's syntax for selecting a single page.
            # The brackets are glob characters and the path may contain spaces,
            # so the whole argument is quoted.
            # NOTE(review): assumes run_command passes the string to a shell
            # (or a shell-style parser) -- confirm against its implementation.
            input_file_page = shlex.quote(f"{self.input_pdf_path}[{i}]")
            output_file = shlex.quote(str(image_path))
            convert_command = (
                "convert -density 150 -trim -background white -alpha remove "
                f"-quality 100 -sharpen 0x1.0 {input_file_page} -quality 100 {output_file}"
            )
            run_command(convert_command)

        return {"pdf_images_path": output_dir}

@ -15,8 +15,4 @@ class VerifyInputFile(WorkflowBase):
reader = PdfReader(self.input_pdf_path)
# output
return {
"pdf_properties": {
"pages": len(reader.pages),
}
}
return {"pdf_pages": len(reader.pages)}

@ -0,0 +1,20 @@
from pathlib import Path
from typing import Any
from py_executable_checklist.workflow import run_workflow
from doc_search.workflow import ConvertImagesToText
def test_images_to_text() -> None:
    """Run only the OCR step on the sample images and check where the text lands."""
    scanned_dir = Path("tests/OutputDir/dr-doc-search/input/scanned")
    context: dict[str, Any] = {
        "input_pdf_path": Path("tests/data/input.pdf"),
        "pdf_images_path": Path("tests/data/images/"),
        "app_dir": Path(".") / "tests",
    }

    run_workflow(context, [ConvertImagesToText])

    # The step publishes its output directory back into the shared context.
    assert context["pages_text_path"] == scanned_dir
    text_files = list(scanned_dir.glob("*.txt"))
    assert len(text_files) == 2

@ -0,0 +1,20 @@
from pathlib import Path
from typing import Any
from py_executable_checklist.workflow import run_workflow
from doc_search.workflow import ConvertPDFToImages
def test_convert_pdf_to_pages() -> None:
    """Run only the PDF-to-image step on the sample PDF and check the PNG output."""
    images_dir = Path("tests/OutputDir/dr-doc-search/input/images")
    context: dict[str, Any] = {
        "input_pdf_path": Path("tests/data/input.pdf"),
        "pdf_pages": 2,
        "app_dir": Path(".") / "tests",
    }

    run_workflow(context, [ConvertPDFToImages])

    # The step publishes its output directory back into the shared context.
    assert context["pdf_images_path"] == images_dir
    page_images = list(images_dir.glob("*.png"))
    assert len(page_images) == 2

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

@ -1,7 +0,0 @@
from doc_search.workflow import VerifyInputFile, workflow_steps
def test_return_expected_workflow() -> None:
    """The workflow is expected to contain only the VerifyInputFile step."""
    steps = workflow_steps()
    assert steps == [VerifyInputFile]

@ -13,4 +13,4 @@ def test_return_pdf_properties() -> None:
run_workflow(context, [VerifyInputFile])
assert context["pdf_properties"].get("pages") == 2
assert context["pdf_pages"] == 2

@ -0,0 +1,20 @@
from doc_search.workflow import (
ConvertImagesToText,
ConvertPDFToImages,
VerifyInputFile,
workflow_steps,
)
def test_return_expected_workflow() -> None:
    """The workflow must list exactly these steps, in this order."""
    wanted = [
        VerifyInputFile,
        ConvertPDFToImages,
        ConvertImagesToText,
        # Listed but not enabled yet:
        # LoadExistingIndex,
        # CreateIndexIfNotExists,
        # FindInterestingSections,
        # AskQuestion
    ]
    actual = workflow_steps()
    assert actual == wanted
Loading…
Cancel
Save