community: Add MarkdownifyTransformer to langchain_community.document_transformers (#21247)

- Added a new document transformer, MarkdownifyTransformer, which uses the
`markdownify` package with customizable options to convert HTML to
Markdown. It is similar to Html2TextTransformer but offers more flexible
options. I've also noticed that MarkdownifyTransformer sometimes
produces better output than the html2text-based one, which is why I use
markdownify in my project.
- Added docs and tests

- Usage:
```python
from langchain_community.document_transformers import MarkdownifyTransformer

markdownify = MarkdownifyTransformer()
docs_transform = markdownify.transform_documents(docs)
```

- Example of better performance on simple task, that I've noticed:
```
<html>
<head><title>Reports on product movement</title></head>
<body>
<p data-block-key="2wst7">The reports on product movement will be useful for forming supplier orders and controlling outcomes.</p>
</body>
```
**Html2TextTransformer**: 
```python
[Document(page_content='The reports on product movement will be useful for forming supplier orders and\ncontrolling outcomes.\n\n')]
# Here we can see 'and\ncontrolling', which has extra '\n' in it
```
**MarkdownifyTransformer**:
```python
[Document(page_content='Reports on product movement\n\nThe reports on product movement will be useful for forming supplier orders and controlling outcomes.')]
```

---------

Co-authored-by: Sokolov Fedor <f.sokolov@sokolov-macbook.bbrouter>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Sokolov Fedor <f.sokolov@sokolov-macbook.local>
Co-authored-by: Sokolov Fedor <f.sokolov@192.168.1.6>
pull/21454/head
Sokolov Fedor 2 weeks ago committed by GitHub
parent d3ce6aad2e
commit f4ddf64faa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

File diff suppressed because one or more lines are too long

@ -45,6 +45,9 @@ if TYPE_CHECKING:
from langchain_community.document_transformers.long_context_reorder import (
LongContextReorder,
)
from langchain_community.document_transformers.markdownify import (
MarkdownifyTransformer,
)
from langchain_community.document_transformers.nuclia_text_transform import (
NucliaTextTransformer,
)
@ -62,6 +65,7 @@ __all__ = [
"GoogleTranslateTransformer",
"Html2TextTransformer",
"LongContextReorder",
"MarkdownifyTransformer",
"NucliaTextTransformer",
"OpenAIMetadataTagger",
"get_stateful_documents",
@ -77,6 +81,7 @@ _module_lookup = {
"GoogleTranslateTransformer": "langchain_community.document_transformers.google_translate", # noqa: E501
"Html2TextTransformer": "langchain_community.document_transformers.html2text",
"LongContextReorder": "langchain_community.document_transformers.long_context_reorder", # noqa: E501
"MarkdownifyTransformer": "langchain_community.document_transformers.markdownify", # noqa: E501
"NucliaTextTransformer": "langchain_community.document_transformers.nuclia_text_transform", # noqa: E501
"OpenAIMetadataTagger": "langchain_community.document_transformers.openai_functions", # noqa: E501
"get_stateful_documents": "langchain_community.document_transformers.embeddings_redundant_filter", # noqa: E501

@ -0,0 +1,83 @@
import re
from typing import Any, List, Optional, Sequence, Union
from langchain_core.documents import BaseDocumentTransformer, Document
class MarkdownifyTransformer(BaseDocumentTransformer):
    """Convert HTML documents to Markdown using the ``markdownify`` library.

    Links, images, other tags and heading styles are handled according to the
    options given at construction time; the options are forwarded to
    ``markdownify`` on every call to :meth:`transform_documents`.

    Args:
        strip: A tag name or list of tag names to strip. This option can't be
            used together with ``convert``.
        convert: A tag name or list of tag names to convert. This option can't
            be used together with ``strip``.
        autolinks: Whether the "automatic link" style should be used when an
            ``<a>`` tag's contents match its href. Defaults to True.
        heading_style: Defines how headings should be converted. Accepted
            values are ATX, ATX_CLOSED, SETEXT, and UNDERLINED (which is an
            alias for SETEXT). Defaults to ATX.
        **kwargs: Additional options to pass to markdownify.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import MarkdownifyTransformer

            markdownify = MarkdownifyTransformer()
            docs_transform = markdownify.transform_documents(docs)

    More configuration options can be found at the markdownify GitHub page:
    https://github.com/matthewwithanm/python-markdownify
    """  # noqa: E501

    def __init__(
        self,
        strip: Optional[Union[str, List[str]]] = None,
        convert: Optional[Union[str, List[str]]] = None,
        autolinks: bool = True,
        heading_style: str = "ATX",
        **kwargs: Any,
    ) -> None:
        # A single tag name is accepted as shorthand for a one-element list.
        self.strip = [strip] if isinstance(strip, str) else strip
        self.convert = [convert] if isinstance(convert, str) else convert
        self.autolinks = autolinks
        self.heading_style = heading_style
        self.additional_options = kwargs

    def transform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Convert the HTML ``page_content`` of each document to Markdown.

        Args:
            documents: Documents whose ``page_content`` is HTML.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            New documents, in input order, holding the Markdown text and a
            shallow copy of the corresponding input metadata.

        Raises:
            ImportError: If the ``markdownify`` package is not installed.
            ValueError: Raised by ``markdownify`` when both ``strip`` and
                ``convert`` were configured.
        """
        try:
            from markdownify import markdownify
        except ImportError as e:
            # Single-line message: the previous triple-quoted literal leaked a
            # newline and source indentation into the user-facing text.
            raise ImportError(
                "markdownify package not found, please "
                "install it with `pip install markdownify`"
            ) from e

        # Compiled once, outside the loop: collapses any run of
        # whitespace-only lines into exactly one blank line.
        blank_lines = re.compile(r"\n\s*\n")

        converted_documents = []
        for doc in documents:
            markdown_content = (
                markdownify(
                    html=doc.page_content,
                    strip=self.strip,
                    convert=self.convert,
                    autolinks=self.autolinks,
                    heading_style=self.heading_style,
                    **self.additional_options,
                )
                # Non-breaking spaces become ordinary spaces.
                .replace("\xa0", " ")
                .strip()
            )
            cleaned_markdown = blank_lines.sub("\n\n", markdown_content)
            converted_documents.append(
                Document(
                    page_content=cleaned_markdown,
                    # Copy so that mutating the output's metadata does not
                    # silently mutate the input document's metadata.
                    metadata={**doc.metadata},
                )
            )

        return converted_documents

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Asynchronously convert documents by running the sync transform in
        the event loop's default executor.

        Args:
            documents: Documents whose ``page_content`` is HTML.
            **kwargs: Forwarded to :meth:`transform_documents`.

        Returns:
            The same result :meth:`transform_documents` would return.
        """
        # Local imports keep this block self-contained with respect to the
        # module's existing import list.
        import asyncio
        from functools import partial

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, partial(self.transform_documents, documents, **kwargs)
        )

@ -13,6 +13,7 @@ EXPECTED_ALL = [
"NucliaTextTransformer",
"OpenAIMetadataTagger",
"Html2TextTransformer",
"MarkdownifyTransformer",
]

@ -0,0 +1,138 @@
"""Unit tests for markdownify document transformer."""
import pytest
from langchain_core.documents import Document
from langchain_community.document_transformers import MarkdownifyTransformer
@pytest.mark.requires("markdownify")
def test_empty_html() -> None:
    """An HTML shell with no content converts to an empty string."""
    source_docs = [Document(page_content="<html></html>")]
    transformer = MarkdownifyTransformer()

    result = transformer.transform_documents(source_docs)

    assert result[0].page_content == ""
@pytest.mark.requires("markdownify")
def test_extract_paragraphs() -> None:
    """Headings and paragraphs come out as blank-line-separated Markdown."""
    html = (
        "<html><h1>Header</h1><p>First paragraph.</p>"
        "<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
    )
    expected = (
        "# Header\n\n" "First paragraph.\n\n" "Second paragraph.\n\n" "# Ignore at end"
    )

    transformer = MarkdownifyTransformer()
    result = transformer.transform_documents([Document(page_content=html)])

    assert result[0].page_content == expected
@pytest.mark.requires("markdownify")
def test_extract_html() -> None:
    """A complete HTML page (doctype, head, body) converts to Markdown."""
    page_html = (
        "<!DOCTYPE html>"
        '<html lang="en">'
        "<head>"
        ' <meta charset="UTF-8">'
        " <title>Simple Test Page</title>"
        "</head>"
        "<body>"
        " <h1>Test Header</h1>"
        " <p>First paragraph.</p>"
        " <p>Second paragraph.</p>"
        ' <a href="https://example.com">Example Link</a>'
        "</body>"
        "</html>"
    )
    expected = (
        "Simple Test Page # Test Header\n\n "
        "First paragraph.\n\n "
        "Second paragraph.\n\n "
        "[Example Link](https://example.com)"
    )

    # NOTE(review): `skip` is forwarded to markdownify as an extra option; the
    # title text still appears in the expected output, so it presumably has no
    # effect here -- confirm whether `strip="title"` was intended.
    transformer = MarkdownifyTransformer(skip="title")
    result = transformer.transform_documents([Document(page_content=page_html)])

    assert result[0].page_content == expected
@pytest.mark.requires("markdownify")
def test_strip_tags() -> None:
    """`strip` accepts a single tag name or a list of tag names."""
    html = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )

    # (strip argument, expected Markdown) pairs, checked in order.
    cases = [
        (
            "strong",
            (
                "# Header\n\n "
                "1st paragraph.\n\n "
                "2nd paragraph. Here is [link](http://example.com)\n\n "
                "![Sample Image](image.jpg)"
                "# Ignore at end"
            ),
        ),
        (
            ["strong", "a", "img"],
            (
                "# Header\n\n "
                "1st paragraph.\n\n "
                "2nd paragraph. Here is link\n\n "
                "# Ignore at end"
            ),
        ),
    ]

    for strip_arg, expected in cases:
        transformer = MarkdownifyTransformer(strip=strip_arg)
        result = transformer.transform_documents([Document(page_content=html)])
        assert result[0].page_content == expected
@pytest.mark.requires("markdownify")
def test_convert_tags() -> None:
    """`convert` accepts a list of tag names or a single tag name."""
    html = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )

    # (convert argument, expected Markdown) pairs, checked in order.
    cases = [
        (
            ["strong", "a"],
            (
                "Header "
                "**1st paragraph.** "
                "2nd paragraph. Here is [link](http://example.com) "
                "Ignore at end"
            ),
        ),
        (
            "p",
            (
                "Header "
                "1st paragraph.\n\n "
                "2nd paragraph. Here is link\n\n "
                "Ignore at end"
            ),
        ),
    ]

    for convert_arg, expected in cases:
        transformer = MarkdownifyTransformer(convert=convert_arg)
        result = transformer.transform_documents([Document(page_content=html)])
        assert result[0].page_content == expected
@pytest.mark.requires("markdownify")
def test_strip_convert_conflict_error() -> None:
    """Configuring both `strip` and `convert` results in a ValueError."""
    html = (
        "<html>"
        "<h1>Header</h1>"
        " <p><strong>1st paragraph.</strong></p>"
        ' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
        ' <img src="image.jpg" alt="Sample Image">'
        "<h1>Ignore at end</h1></html>"
    )
    source_docs = [Document(page_content=html)]
    expected_message = (
        "You may specify either tags to strip or tags to convert, but not both."
    )

    # Construction stays inside the context manager so the test holds whether
    # the conflict is detected at construction or at transform time.
    with pytest.raises(ValueError, match=expected_message):
        transformer = MarkdownifyTransformer(strip="h1", convert=["strong", "a"])
        transformer.transform_documents(source_docs)
Loading…
Cancel
Save