Skip to content

Utils Module Documentation

Document Loaders

Base Class

BaseLoader

Bases: BaseModel, ABC

BaseLoader

load abstractmethod

load(path_or_uri: str, *, file: bytes | IO[bytes] | None = None, content_type: Optional[str] = None, **kwargs: Any) -> Document

从给定的 URI 或文件对象加载文档 / Load document from the given URI or file object.

Parameters:

Name Type Description Default
path_or_uri str

文档的路径或 URI (必填)/ File path or uri (required)

required
file bytes | IO[bytes] | None

文件内容,可以是 bytes 或 IO[bytes] 对象(与 path_or_uri 参数二选一)/ File content as bytes or IO[bytes] (mutually exclusive with path_or_uri).

None
content_type Optional[str]

文档的内容类型(MIME 类型)/ The content type of the document.

None
**kwargs Any

其他可选参数 / Additional keyword arguments.

{}

Returns:

Name Type Description
Document Document

加载的文档对象 / The loaded document.

Raises:

Type Description
ValueError

如果 path_or_uri 和 file 都未提供,或都提供了 / If neither or both path_or_uri and file are provided.

Source code in tfrobot/utils/document_loaders/base.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
@abstractmethod
def load(
    self,
    path_or_uri: str,
    *,
    file: bytes | IO[bytes] | None = None,
    content_type: Optional[str] = None,
    **kwargs: Any,
) -> Document:
    """
    Load a document from a path/URI or from in-memory file content.

    This is the abstract hook of the loader interface: the body here does
    nothing and returns ``None``; the contract below is what the original
    documentation states for implementations.

    Args:
        path_or_uri: File path or URI of the document (required).
        file: File content as ``bytes`` or a binary stream; documented as an
            alternative to ``path_or_uri``.
        content_type: Content type (MIME type) of the document.
        **kwargs: Additional keyword arguments.

    Returns:
        Document: The loaded document.

    Raises:
        ValueError: If neither or both of ``path_or_uri`` and ``file`` are
            provided.

    NOTE(review): ``path_or_uri`` has no default in the signature, so it can
    never be omitted even though the contract calls it mutually exclusive
    with ``file`` - confirm the intended behavior against the implementations.
    """
    ...

get_file_obj staticmethod

get_file_obj(url: str) -> BytesIO

将一个合法的file:// https:// http:// 对象转换为BytesIO

Parameters:

Name Type Description Default
url str

对象下载地址

required
Returns:

BytesIO: 字节 IO 流 / In-memory binary stream holding the downloaded content.

Source code in tfrobot/utils/document_loaders/base.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
@staticmethod
def get_file_obj(url: str) -> BytesIO:
    """
    Read the full content behind a URI into an in-memory binary stream.

    Args:
        url (str): Address of the object to fetch. The original documentation
            names file://, https:// and http:// as the accepted schemes.

    Returns:
        BytesIO: Buffer wrapping everything read from ``url``.
    """
    # Drain the handle while the open_uri context is still active, then wrap
    # the bytes once the handle has been released.
    with open_uri(url) as handle:
        payload: Optional[bytes] = handle.read()
    return BytesIO(payload)

temp_url_download

temp_url_download(url: str) -> Generator[bytes, None, None]

Download a file from a given URL and yield its content as bytes. The content is held in memory; no temporary file is created or deleted.

:param url: The URL of the file to download. :return: The downloaded content as bytes (yielded by the context manager). :raises ValueError: If the response status code is not 200.

Source code in tfrobot/utils/document_loaders/base.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
@contextmanager
def temp_url_download(url: str) -> Generator[bytes, None, None]:
    """
    Download ``url`` over HTTP and yield the response body as bytes.

    Despite the name, no temporary file is involved: the whole body is kept
    in memory and handed to the caller, so there is nothing to clean up
    afterwards.

    Args:
        url: The URL of the file to download.

    Yields:
        bytes: The raw response body.

    Raises:
        ValueError: If the response status code is not 200.
    """
    # Complete the request and release the client *before* yielding, so no
    # connection is held open for as long as the caller's ``with`` body runs.
    # ``client.get`` reads the whole body, so ``response.content`` stays
    # available after the client is closed.
    with httpx.Client() as client:
        response = client.get(url)
    if response.status_code != 200:
        raise ValueError("Unable to download the file.")
    yield response.content

open_uri

open_uri(uri: str, *, transport_params: Optional[dict] = None, **kwargs: Any) -> Generator[SeekableBufferedInputBase | BufferedReader, None, None]

Open a URI and yield the file handle. Closes the file handle after use.

If URI is a local file, opens the file handle directly. If URI is a remote file, downloads the URI content.

Note: This function is not thread-safe.

Parameters:

Name Type Description Default
uri str

The URI of the file to open.

required
transport_params Optional[dict]

The transport parameters for the file.

None
**kwargs Any

Additional keyword arguments to pass to the open function.

{}

Yields:

Type Description
SeekableBufferedInputBase | BufferedReader

The file handle.

Raises:

Type Description
ValueError

If unable to open the URI.

Source code in tfrobot/utils/document_loaders/base.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
@contextmanager
def open_uri(
    uri: str, *, transport_params: Optional[dict] = None, **kwargs: Any
) -> Generator[SeekableBufferedInputBase | BufferedReader, None, None]:
    """
    Open a URI in binary read mode and yield the file handle.

    The URI is percent-decoded and then opened through ``smart_open``.
    The handle is closed when the ``with`` block of the caller exits.

    Only failures of the open step itself are translated into ``ValueError``.
    Exceptions raised inside the caller's ``with`` body propagate unchanged,
    with their original type and traceback.

    Note: This function is documented as not thread-safe.

    Args:
        uri: The URI of the file to open.
        transport_params: The transport parameters for the file.
        **kwargs: Additional keyword arguments to pass to the open function.

    Yields:
        The file handle.

    Raises:
        ValueError: If unable to open the URI.
    """
    # Keep the try block limited to decoding and opening. Previously the
    # ``yield`` lived inside it, so a TypeError/ValueError thrown by the
    # caller's own code was re-entered at the yield point, caught here and
    # re-raised as a misleading "cannot open URI" error.
    try:
        uri = unquote(uri)
        handle = smart_open.open(uri, "rb", transport_params=transport_params, **kwargs)
    except (TypeError, ValueError) as e:
        raise ValueError(f"无法打开 URI: {uri} / Unable to open URI: {uri}") from e  # pragma: no cover
    # Outside the try: errors from the caller's body pass through untouched,
    # while the context manager still guarantees the handle is closed.
    with handle as f:
        yield f

load_image

load_image(file_or_url: str | Path | Url) -> Tuple[bytes, str]

加载图片文件或者URL,形成base64编码

Parameters:

Name Type Description Default
file_or_url str | Path | Url

图片文件路径或者URL

required

Returns:

Name Type Description
str Tuple[bytes, str]

图片的 base64 编码(bytes)及其 MIME 类型(str)/ The base64-encoded image bytes and the detected MIME type.

Source code in tfrobot/utils/document_loaders/base.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
def load_image(file_or_url: str | Path | Url) -> Tuple[bytes, str]:
    """
    Read an image from a file path or URL and return it base64-encoded.

    Args:
        file_or_url (str | Path | Url): Image file path or URL.

    Returns:
        Tuple[bytes, str]: The base64-encoded image content and the MIME
        type detected from the raw (un-encoded) bytes.
    """
    import base64

    import magic
    from smart_open import open as sopen

    # Pull the raw bytes first; sniffing and encoding only need the buffer.
    with sopen(file_or_url, "rb") as stream:
        raw = stream.read()

    # Second positional argument asks magic for a MIME type string.
    detected_mime = magic.from_buffer(raw, True)
    encoded = base64.b64encode(raw)
    return encoded, detected_mime