class Document(Serializable):
    """A piece of text bundled with the metadata that describes it."""

    page_content: str
    """The text itself."""

    metadata: dict = Field(default_factory=dict)
    """Free-form metadata about the page content, such as its source or its
    relationships to other documents. Defaults to a new empty dict.
    """

    @property
    def lc_serializable(self) -> bool:
        """Report whether this class is serializable; always ``True``."""
        return True
def load(self) -> List[Document]:
    """Read ``self.file_path`` as text and wrap it in a single Document.

    The file is first opened with ``self.encoding``. If that raises
    ``UnicodeDecodeError`` and ``self.autodetect_encoding`` is truthy, each
    candidate returned by ``detect_file_encodings(self.file_path)`` is tried
    in order (its ``.encoding`` attribute is passed to ``open``) and the first
    one that decodes the file is used.

    Returns:
        A one-element list holding a Document whose ``page_content`` is the
        file's text and whose metadata is ``{"source": self.file_path}``.

    Raises:
        RuntimeError: If the file cannot be opened or read, if decoding fails
            and autodetection is disabled, or if autodetection is enabled but
            none of the detected encodings can decode the file. The original
            exception is chained as ``__cause__``.
    """
    try:
        with open(self.file_path, encoding=self.encoding) as f:
            text = f.read()
    except UnicodeDecodeError as e:
        if not self.autodetect_encoding:
            raise RuntimeError(f"Error loading {self.file_path}") from e
        detected_encodings = detect_file_encodings(self.file_path)
        for encoding in detected_encodings:
            logger.debug("Trying encoding: %s", encoding.encoding)
            try:
                with open(self.file_path, encoding=encoding.encoding) as f:
                    text = f.read()
                break
            except UnicodeDecodeError:
                continue
        else:
            # No candidate decoded the file (or none were detected). Fail
            # loudly rather than returning a Document with empty content.
            raise RuntimeError(f"Error loading {self.file_path}") from e
    except Exception as e:
        raise RuntimeError(f"Error loading {self.file_path}") from e

    metadata = {"source": self.file_path}
    return [Document(page_content=text, metadata=metadata)]
# Example: point a TextLoader at ./index.html and call load().
from langchain.document_loaders import TextLoader
loader = TextLoader("./index.html")
# The return value of load() is not bound to a name here.
# NOTE(review): presumably a list of Document objects -- confirm against TextLoader.
loader.load()
# Example: load ./index.csv with CSVLoader, first with only file_path given.
from langchain.document_loaders.csv_loader import CSVLoader
loader = CSVLoader(file_path='./index.csv')
data = loader.load()
# Load the same file again, this time passing source_column="班级" ("班级" is
# Chinese for "class", as in a school class). Rebinds both `loader` and `data`.
# NOTE(review): presumably source_column selects the CSV column used as each
# document's source -- confirm against the CSVLoader documentation.
loader = CSVLoader(file_path='./index.csv', source_column="班级")
data = loader.load()
# Example: load index.html with UnstructuredHTMLLoader; rebinds `loader` and `data`.
# NOTE(review): this loader likely needs the optional `unstructured` package -- confirm.
from langchain.document_loaders import UnstructuredHTMLLoader
loader = UnstructuredHTMLLoader("index.html")
data = loader.load()
# Example: load the same index.html with BSHTMLLoader; rebinds `loader` and `data`.
# NOTE(review): this loader likely needs BeautifulSoup installed -- confirm.
from langchain.document_loaders import BSHTMLLoader
loader = BSHTMLLoader("index.html")
data = loader.load()
# Bare expression: shows the value in a notebook/REPL, has no effect in a script.
data
# Example: parse ./index.json with the standard library and bind the result to `data`.
# JSONLoader and pprint are imported here but not used in this snippet.
from langchain.document_loaders import JSONLoader
import json
from pathlib import Path
from pprint import pprint

file_path = './index.json'
# Decode explicitly as UTF-8: without `encoding`, read_text() uses the platform
# locale encoding, which can fail on or garble non-ASCII JSON on some systems.
data = json.loads(Path(file_path).read_text(encoding="utf-8"))