from haystack import Document
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
# Build an in-memory document store and load three sample documents into it.
document_store = InMemoryDocumentStore()
documents = [
    Document(content="There are over 7,000 languages spoken around the world today."),
    Document(
        content=(
            "Elephants have been observed to behave in a way that indicates "
            "a high level of self-awareness, such as recognizing themselves in mirrors."
        )
    ),
    Document(
        content=(
            "In certain parts of the world, like the Maldives, Puerto Rico, "
            "and San Diego, you can witness the phenomenon of bioluminescent waves."
        )
    ),
]
document_store.write_documents(documents=documents)

# Query the store through the BM25 retriever and print every returned
# document together with the score attached to it.
retriever = InMemoryBM25Retriever(document_store=document_store)
docs = retriever.run(query="How many languages are spoken around the world today?")["documents"]
for doc in docs:
    print(f"content: {doc.content}")
    print(f"score: {doc.score}")
输出
content: There are over 7,000 languages spoken around the world today.
score: 7.815769833242408
content: In certain parts of the world, like the Maldives, Puerto Rico, and San Diego,
you can witness the phenomenon of bioluminescent waves.
score: 4.314753296196667
content: Elephants have been observed to behave in a way that indicates a high level
of self-awareness, such as recognizing themselves in mirrors.
score: 3.652595952218814
最近几年,基于BERT架构衍生出来的多种语义检索技术被更多地用到了RAG中。BERT是一种encoder-only的transformer架构:
密集嵌入检索器基于双编码器(Bi-Encoder)架构,在BERT上面外加一层池化层(Pooling),得到单一的句向量,存储到document.embedding中。
from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import (
SentenceTransformersTextEmbedder,
SentenceTransformersDocumentEmbedder,
)
from haystack.components.retrievers import InMemoryEmbeddingRetriever
# In-memory store configured with cosine similarity for embedding comparison.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
documents = [
    Document(content="There are over 7,000 languages spoken around the world today."),
    Document(
        content=(
            "Elephants have been observed to behave in a way that indicates "
            "a high level of self-awareness, such as recognizing themselves in mirrors."
        )
    ),
    Document(
        content=(
            "In certain parts of the world, like the Maldives, Puerto Rico, "
            "and San Diego, you can witness the phenomenon of bioluminescent waves."
        )
    ),
]

# Embed the documents first, then write the embedded copies to the store.
document_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)
document_embedder.warm_up()
documents_with_embeddings = document_embedder.run(documents)["documents"]
document_store.write_documents(documents_with_embeddings)

# Inspect what the embedder produced. No retrieval has run at this point;
# the sample output that follows this snippet shows the score as None.
for doc in documents_with_embeddings:
    print(f"content: {doc.content}")
    print(f"score: {doc.score}")
    print(f"embedding: {doc.embedding}\n")
content: There are over 7,000 languages spoken around the world today.
score: None
embedding: [0.03276507928967476, ..., 0.022160163149237633]
content: Elephants have been observed to behave in a way that indicates
a high level of self-awareness, such as recognizing themselves in mirrors.
score: None
embedding: [0.01985647901892662, ..., 0.007489172276109457]
content: In certain parts of the world, like the Maldives, Puerto Rico,
and San Diego, you can witness the phenomenon of bioluminescent waves.
score: None
embedding: [0.08535218983888626, ..., 0.013049677945673466]
# Query pipeline: embed the query text, then retrieve by embedding.
query_pipeline = Pipeline()
query_pipeline.add_component(
    "text_embedder",
    SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
)
query_pipeline.add_component(
    "retriever", InMemoryEmbeddingRetriever(document_store=document_store)
)
# Feed the embedder's output into the retriever's query_embedding input.
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

query = "How many languages are there?"
result = query_pipeline.run({"text_embedder": {"text": query}})
result_documents = result["retriever"]["documents"]
for doc in result_documents:
    print(f"content: {doc.content}")
    print(f"score: {doc.score}\n")
content: There are over 7,000 languages spoken around the world today.
score: 0.7557791921810213
content: Elephants have been observed to behave in a way that indicates
a high level of self-awareness, such as recognizing themselves in mirrors.
score: 0.04221229572888512
content: In certain parts of the world, like the Maldives, Puerto Rico,
and San Diego, you can witness the phenomenon of bioluminescent waves.
score: -0.001667837080811814
from haystack import Document
from haystack.components.rankers import TransformersSimilarityRanker
# Sample documents to be scored directly against the query by the ranker.
documents = [
    Document(content="There are over 7,000 languages spoken around the world today."),
    Document(
        content=(
            "Elephants have been observed to behave in a way that indicates "
            "a high level of self-awareness, such as recognizing themselves in mirrors."
        )
    ),
    Document(
        content=(
            "In certain parts of the world, like the Maldives, Puerto Rico, "
            "and San Diego, you can witness the phenomenon of bioluminescent waves."
        )
    ),
]

# The ranker takes the query and the documents together; no document store
# is involved in this snippet.
ranker = TransformersSimilarityRanker(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
ranker.warm_up()
query = "How many languages are there?"
ranked_documents = ranker.run(query=query, documents=documents)["documents"]
for doc in ranked_documents:
    print(f"content: {doc.content}")
    print(f"score: {doc.score}\n")
content: There are over 7,000 languages spoken around the world today.
score: 0.9998884201049805
content: Elephants have been observed to behave in a way that indicates
a high level of self-awareness, such as recognizing themselves in mirrors.
score: 1.4616251974075567e-05
content: In certain parts of the world, like the Maldives, Puerto Rico,
and San Diego, you can witness the phenomenon of bioluminescent waves.
score: 1.4220857337932102e-05
挑一种文档划分方法,再挑一个检索器,一个简单的RAG就可以完成了
from prompt_toolkit import prompt
from haystack import Pipeline
from haystack.utils import Secret
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.embedders import (
SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder,)
# Indexing pipeline: fetch a web page, convert the HTML to documents, split
# them into overlapping sentence windows, embed the chunks and write them to
# the in-memory store. Each component is registered right after it is created.
document_store = InMemoryDocumentStore()
indexing_pipeline = Pipeline()

fetcher = LinkContentFetcher()
indexing_pipeline.add_component("fetcher", fetcher)

converter = HTMLToDocument()
indexing_pipeline.add_component("converter", converter)

# Chunks of 3 sentences, with 1 sentence shared between neighbouring chunks.
splitter = DocumentSplitter(split_by="sentence", split_length=3, split_overlap=1)
indexing_pipeline.add_component("splitter", splitter)

document_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
)
indexing_pipeline.add_component("document_embedder", document_embedder)

writer = DocumentWriter(document_store=document_store)
indexing_pipeline.add_component("writer", writer)

# Wire the stages: fetcher -> converter -> splitter -> embedder -> writer.
indexing_pipeline.connect("fetcher.streams", "converter.sources")
indexing_pipeline.connect("converter.documents", "splitter.documents")
indexing_pipeline.connect("splitter.documents", "document_embedder.documents")
indexing_pipeline.connect("document_embedder.documents", "writer.documents")

# Run the pipeline once over the page that serves as the knowledge source.
indexing_pipeline.run(
    data={"fetcher": {"urls": ["https://en.wikipedia.org/wiki/Nanjing_University"]}}
)
# Generator settings. The key below is a placeholder value.
# NOTE(review): presumably replaced with a real key before running — confirm.
api_key = "xxx"
model = "gpt-4o-mini"
api_base_url = None

# Jinja template: lists the content of every retrieved document, then the
# question, and leaves the answer for the generator to complete.
prompt_template = """
Given these documents, answer the question.
Documents:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
Question: {{question}}
Answer:
"""

# Query-side components; the embedder uses the same model name as the
# document embedder of the indexing pipeline.
query_embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
)
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
prompt_builder = PromptBuilder(template=prompt_template)
llm = OpenAIGenerator(
    api_key=Secret.from_token(api_key),
    model=model,
    api_base_url=api_base_url,
)
# RAG pipeline: query embedding -> retrieval -> prompt building -> generation.
# Each connection is made as soon as both of its endpoints are registered.
rag_pipeline = Pipeline()

rag_pipeline.add_component("query_embedder", query_embedder)
rag_pipeline.add_component("retriever", retriever)
rag_pipeline.connect("query_embedder.embedding", "retriever.query_embedding")

rag_pipeline.add_component("prompt_builder", prompt_builder)
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")

rag_pipeline.add_component("llm", llm)
rag_pipeline.connect("prompt_builder.prompt", "llm.prompt")
# Interactive question/answer loop over the RAG pipeline.
while True:
    try:
        question = prompt("> ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C at the prompt ends the session instead of
        # terminating the script with a traceback.
        break
    if not question.strip():
        # Nothing to ask: skip the embedding and LLM calls for blank input.
        continue
    # The same question goes to the embedder (for retrieval) and to the
    # prompt builder (to be rendered into the template).
    results = rag_pipeline.run(
        {
            "query_embedder": {"text": question},
            "prompt_builder": {"question": question},
        }
    )
    reply = results["llm"]["replies"][0]
    print(reply)
What is the motto of Nanjing University?
The motto of Nanjing University is "诚朴雄伟励学敦行," which translates to "Sincerity with Aspiration, Perseverance and Integrity" in English. The first half of this motto was the motto during the National Central University time, and the last half was quoted from the classic literature work Book of Rites.
What is the song of Nanjing University?
The song of Nanjing University is the university song, which was created in 1916. It is the first school song in the modern history of Nanjing University. The lyrics were written by Jiang Qian, and the melody was composed by Li Shutong. The song was recovered in 2002.
question: Who is the modern China's first PhD in Chinese Language and Literature?
The modern China's first PhD in Chinese Language and Literature is Mo Lifeng (莫砺锋), as mentioned in the documents.
from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import (
SentenceTransformersTextEmbedder,
SentenceTransformersDocumentEmbedder,
)
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.joiners.document_joiner import DocumentJoiner
# Shared store for both retrievers in this example; embedding comparison
# is configured to use cosine similarity.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
query = "What are effective strategies to improve English speaking skills?"
documents = [
    Document(
        content="Practicing with native speakers enhances English speaking proficiency."
    ),
    Document(
        content=(
            "Regular participation in debates and discussions "
            "refine public speaking skills in English."
        )
    ),
    Document(
        content=(
            "Studying the history of the English language does "
            "not directly improve speaking skills."
        )
    ),
]

# Embed the documents once and store the embedded copies, so the same store
# can serve keyword and embedding retrieval.
document_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)
document_embedder.warm_up()
documents_with_embeddings = document_embedder.run(documents)["documents"]
document_store.write_documents(documents_with_embeddings)

# Keyword (BM25) retrieval with scale_score enabled.
bm25_retriever = InMemoryBM25Retriever(document_store=document_store, scale_score=True)
bm25_docs = bm25_retriever.run(query=query)["documents"]
print("bm25:")
for doc in bm25_docs:
    print(f"content: {doc.content}")
    print(f"score: {doc.score}\n")
content: Studying the history of the English language does not directly improve
speaking skills.
score: 0.5593245377361279
content: Regular participation in debates and discussions refine public speaking
skills in English.
score: 0.545159185512614
content: Practicing with native speakers enhances English speaking proficiency.
score: 0.5387709786621966
# Dense retrieval over the same store: embed the query, then retrieve by
# embedding with scale_score enabled.
query_pipeline = Pipeline()
query_pipeline.add_component(
    "text_embedder",
    SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
)
query_pipeline.add_component(
    "dense_retriever",
    InMemoryEmbeddingRetriever(document_store=document_store, scale_score=True),
)
query_pipeline.connect("text_embedder.embedding", "dense_retriever.query_embedding")

dense_result = query_pipeline.run({"text_embedder": {"text": query}})
dense_docs = dense_result["dense_retriever"]["documents"]
print("dense:")
for doc in dense_docs:
    print(f"content: {doc.content}")
    print(f"score: {doc.score}\n")
content: Practicing with native speakers enhances English speaking proficiency.
score: 0.8296398226909952
content: Regular participation in debates and discussions refine public speaking
skills in English.
score: 0.8017774366152697
content: Studying the history of the English language does not directly improve
speaking skills.
score: 0.7334273104138469
# Combine both result lists in "merge" mode; the weights are given in the
# same order as the lists passed to run() (BM25 first, dense second).
joiner = DocumentJoiner(join_mode="merge", weights=[0.3, 0.7])
merge_docs = joiner.run(documents=[bm25_docs, dense_docs])["documents"]
# Header added so this listing is labelled like the bm25/dense/rrf ones.
print("merge:")
for doc in merge_docs:
    print(f"content: {doc.content}")
    print(f"score: {doc.score}\n")
content: Practicing with native speakers enhances English speaking proficiency.
score: 0.7423791694823556
content: Regular participation in debates and discussions refine public speaking
skills in English.
score: 0.724791961284473
content: Studying the history of the English language does not directly improve
speaking skills.
score: 0.6811964786105311
# Combine the same two result lists with reciprocal rank fusion.
joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion")
rrf_docs = joiner.run(documents=[bm25_docs, dense_docs])["documents"]
print("rrf:")
for doc in rrf_docs:
    print(f"content: {doc.content}")
    print(f"score: {doc.score}\n")
content: Studying the history of the English language does not directly improve speaking skills.
score: 0.9841269841269842
content: Practicing with native speakers enhances English
speaking proficiency.
score: 0.9841269841269842
content: Regular participation in debates and discussions refine public speaking
skills in English.
score: 0.9838709677419354
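RRF计算:haystack使用k=61,并且进行了额外的放缩处理,即 $\text{score}(d) = \frac{k}{N}\sum_{i}\frac{1}{k + \text{rank}_i(d)}$,其中 $N$ 是结果列表的数量,求和遍历所有包含文档 $d$ 的结果列表,$\text{rank}_i(d)$ 是文档 $d$ 在第 $i$ 个列表中的排名(从0开始)。例如某文档在两个列表中分别排第0位和第2位时,得分为 $\frac{61}{2}\left(\frac{1}{61}+\frac{1}{63}\right)\approx 0.9841$。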
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.rankers import TransformersSimilarityRanker
# First stage of a retrieve-then-rerank example: BM25 picks the candidates.
query = "What are effective strategies to improve English speaking skills?"
documents = [
    Document(
        content="Practicing with native speakers enhances English speaking proficiency."
    ),
    Document(
        content="Daily vocabulary expansion is crucial for improving oral communication skills."
    ),
    Document(
        content="Engaging in language exchange programs can significantly boost speaking abilities."
    ),
    Document(
        content="Regular participation in debates and discussions refine public speaking skills in English."
    ),
    Document(
        content="Studying the history of the English language does not directly improve speaking skills."
    ),
]
document_store = InMemoryDocumentStore()
document_store.write_documents(documents)

# Ask for the top 4 of the 5 stored documents.
bm25_retriever = InMemoryBM25Retriever(document_store=document_store)
bm25_docs = bm25_retriever.run(query=query, top_k=4)["documents"]
print("bm25:")
for doc in bm25_docs:
    print(f"content: {doc.content}")
    print(f"score: {doc.score}\n")
bm25:
content: Studying the history of the English language does not directly improve speaking skills.
score: 3.1117211646172698
content: Regular participation in debates and discussions refine public speaking skills in English.
score: 2.443788686074245
content: Practicing with native speakers enhances English speaking proficiency.
score: 2.2622329312889553
content: Daily vocabulary expansion is crucial for improving oral communication skills.
score: 2.0359854825047066
# Second stage: re-score the BM25 candidates against the query with the
# cross-encoder model and keep the top 3.
reranker = TransformersSimilarityRanker(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
reranker.warm_up()
reranked_docs = reranker.run(query=query, documents=bm25_docs, top_k=3)["documents"]
print("reranker:")
for doc in reranked_docs:
    print(f"content: {doc.content}")
    print(f"score: {doc.score}\n")
reranker:
content: Practicing with native speakers enhances English speaking proficiency.
score: 0.769904375076294
content: Studying the history of the English language does not directly improve
speaking skills.
score: 0.5486361384391785
content: Daily vocabulary expansion is crucial for improving oral communication
skills.
score: 0.3509156107902527
小文档块的检索准确度更高,但丢失了更多上下文信息,因此可以在检索后丰富上下文来补偿
https://marp.app/
---

## 简单以句子为单位切分

```python
simple_splitter = DocumentSplitter(split_by="sentence", split_length=1, split_overlap=0)
simple_docs = simple_splitter.run(documents=[document])["documents"]
print("\nsimple:")
for index, doc in enumerate(simple_docs):
    print(f"document_{index}: {doc.content}")
```