You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/tests/unit_tests/document_transformers/test_markdownify.py

139 lines
4.8 KiB
Python

"""Unit tests for markdownify document transformer."""
import pytest
from langchain_core.documents import Document
from langchain_community.document_transformers import MarkdownifyTransformer
@pytest.mark.requires("markdownify")
def test_empty_html() -> None:
    """An ``<html>`` element with no content is transformed to an empty string."""
    transformer = MarkdownifyTransformer()
    source_docs = [Document(page_content="<html></html>")]

    converted = transformer.transform_documents(source_docs)

    assert converted[0].page_content == ""
@pytest.mark.requires("markdownify")
def test_extract_paragraphs() -> None:
    """Headings and paragraphs are rendered as Markdown with default options."""
    html = (
        "<html><h1>Header</h1><p>First paragraph.</p>"
        "<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
    )
    # One fragment per rendered block; the pieces concatenate to the full text.
    expected = (
        "# Header\n\n"
        "First paragraph.\n\n"
        "Second paragraph.\n\n"
        "# Ignore at end"
    )
    transformer = MarkdownifyTransformer()

    converted = transformer.transform_documents([Document(page_content=html)])

    assert converted[0].page_content == expected
@pytest.mark.requires("markdownify")
def test_extract_html() -> None:
    """Transform a complete HTML page using a transformer built with ``skip="title"``.

    The expected Markdown asserted here still contains the page title text,
    followed by the heading, both paragraphs and the link.
    """
    # The page is assembled from fragments with no separator between them.
    html = "".join(
        (
            "<!DOCTYPE html>",
            '<html lang="en">',
            "<head>",
            ' <meta charset="UTF-8">',
            " <title>Simple Test Page</title>",
            "</head>",
            "<body>",
            " <h1>Test Header</h1>",
            " <p>First paragraph.</p>",
            " <p>Second paragraph.</p>",
            ' <a href="https://example.com">Example Link</a>',
            "</body>",
            "</html>",
        )
    )
    expected = (
        "Simple Test Page # Test Header\n\n "
        "First paragraph.\n\n "
        "Second paragraph.\n\n "
        "[Example Link](https://example.com)"
    )
    transformer = MarkdownifyTransformer(skip="title")

    converted = transformer.transform_documents([Document(page_content=html)])

    assert converted[0].page_content == expected
@pytest.mark.requires("markdownify")
def test_strip_tags() -> None:
    """Exercise the ``strip`` option, given as a single tag and as a list of tags."""
    html = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )
    # Each case pairs a ``strip`` argument with the Markdown expected for it.
    # Cases run in order; the first failing assertion stops the test.
    cases = [
        (
            "strong",
            (
                "# Header\n\n "
                "1st paragraph.\n\n "
                "2nd paragraph. Here is [link](http://example.com)\n\n "
                "![Sample Image](image.jpg)"
                "# Ignore at end"
            ),
        ),
        (
            ["strong", "a", "img"],
            (
                "# Header\n\n "
                "1st paragraph.\n\n "
                "2nd paragraph. Here is link\n\n "
                "# Ignore at end"
            ),
        ),
    ]

    for strip_arg, expected in cases:
        transformer = MarkdownifyTransformer(strip=strip_arg)
        converted = transformer.transform_documents([Document(page_content=html)])
        assert converted[0].page_content == expected
@pytest.mark.requires("markdownify")
def test_convert_tags() -> None:
    """Exercise the ``convert`` option, given as a list of tags and as a single tag."""
    html = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )
    # Each case pairs a ``convert`` argument with the Markdown expected for it.
    # Cases run in order; the first failing assertion stops the test.
    cases = [
        (
            ["strong", "a"],
            (
                "Header "
                "**1st paragraph.** "
                "2nd paragraph. Here is [link](http://example.com) "
                "Ignore at end"
            ),
        ),
        (
            "p",
            (
                "Header "
                "1st paragraph.\n\n "
                "2nd paragraph. Here is link\n\n "
                "Ignore at end"
            ),
        ),
    ]

    for convert_arg, expected in cases:
        transformer = MarkdownifyTransformer(convert=convert_arg)
        converted = transformer.transform_documents([Document(page_content=html)])
        assert converted[0].page_content == expected
@pytest.mark.requires("markdownify")
def test_strip_convert_conflict_error() -> None:
    """Passing both ``strip`` and ``convert`` must end in a ``ValueError``.

    The error message is checked against the exact text asserted below.
    """
    import re  # local import: only needed to escape the expected message

    # Fixture setup lives outside ``pytest.raises`` so that only the
    # operations under test run inside the context manager.
    html = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )
    documents = [Document(page_content=html)]
    expected_message = (
        "You may specify either tags to strip or tags to convert, but not both."
    )

    # ``match`` is interpreted as a regular expression, so the literal message
    # is escaped; otherwise its trailing "." would match any character.
    with pytest.raises(ValueError, match=re.escape(expected_message)):
        # Both construction and transformation stay inside the context manager:
        # the test does not depend on which of the two steps reports the
        # conflict.
        transformer = MarkdownifyTransformer(strip="h1", convert=["strong", "a"])
        transformer.transform_documents(documents)