Designed for enterprise-grade search, with support for Elasticsearch and hybrid retrieval
Flexible, component-based design with visual pipeline orchestration
Complete REST API, monitoring, and evaluation tooling; supports large-scale deployment
pip install haystack-ai  # Haystack 2.x package (the legacy 1.x package is farm-haystack)
from haystack import Document, Pipeline
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Create a document store and write documents (Haystack 2.x expects Document objects, not dicts)
document_store = InMemoryDocumentStore()
documents = [
    Document(content="Haystack is an NLP framework"),
    Document(content="It supports RAG and search applications")
]
document_store.write_documents(documents)

# Build the pipeline: retriever -> prompt builder -> generator
# (the retriever outputs documents, so a PromptBuilder is needed before the LLM)
pipeline = Pipeline()
pipeline.add_component("retriever", InMemoryBM25Retriever(document_store))
pipeline.add_component("prompt_builder", PromptBuilder(
    template="Answer based on these documents:\n{% for doc in documents %}{{ doc.content }}\n{% endfor %}\nQuestion: {{ query }}\nAnswer:"
))
pipeline.add_component("llm", OpenAIGenerator())

# Connect the components
pipeline.connect("retriever.documents", "prompt_builder.documents")
pipeline.connect("prompt_builder.prompt", "llm.prompt")

# Run
result = pipeline.run({
    "retriever": {"query": "What is Haystack?"},
    "prompt_builder": {"query": "What is Haystack?"}
})
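The generated answer can then be read from the generator's replies output:

print(result["llm"]["replies"][0])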
Store and manage documents
Retrieve relevant documents
Rerank the results
Generate answers
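Each of these roles maps onto a Haystack component family; a minimal sketch of typical choices (the classes below are standard Haystack 2.x components, though many alternatives exist):

# One typical component per role
from haystack.document_stores.in_memory import InMemoryDocumentStore        # store and manage documents
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever  # retrieve relevant documents
from haystack.components.rankers import TransformersSimilarityRanker        # rerank the results
from haystack.components.generators import OpenAIGenerator                  # generate answers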
Combining BM25 keyword retrieval with embedding-based semantic retrieval can improve retrieval accuracy by 30%+ on many workloads compared with either method alone
from haystack import Pipeline
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.joiners import DocumentJoiner
from haystack.components.rankers import TransformersSimilarityRanker
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator

# Create the pipeline (documents in the store must already have embeddings,
# e.g. written via SentenceTransformersDocumentEmbedder)
pipeline = Pipeline()

# Two retrieval branches: keyword (BM25) and semantic (embeddings)
pipeline.add_component("bm25_retriever", InMemoryBM25Retriever(document_store, top_k=10))
pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())  # embeds the query text
pipeline.add_component("embedding_retriever", InMemoryEmbeddingRetriever(document_store, top_k=10))

# Merge the results of both branches
pipeline.add_component("joiner", DocumentJoiner(join_mode="merge"))

# Rerank (optional but recommended)
pipeline.add_component("ranker", TransformersSimilarityRanker(model="cross-encoder/ms-marco-MiniLM-L-6-v2", top_k=5))

# Build the prompt
template = """
Answer the question based on the following documents:
{% for doc in documents %}
Document {{ loop.index }}:
{{ doc.content }}
---
{% endfor %}
Question: {{ query }}
Answer:
"""
pipeline.add_component("prompt_builder", PromptBuilder(template=template))

# Generate the answer
pipeline.add_component("llm", OpenAIGenerator(model="gpt-4"))

# Connect all components
pipeline.connect("text_embedder.embedding", "embedding_retriever.query_embedding")
pipeline.connect("bm25_retriever.documents", "joiner.documents")
pipeline.connect("embedding_retriever.documents", "joiner.documents")
pipeline.connect("joiner.documents", "ranker.documents")
pipeline.connect("ranker.documents", "prompt_builder.documents")
pipeline.connect("prompt_builder.prompt", "llm.prompt")

# Run a query
query = "What is Haystack?"
result = pipeline.run({
    "bm25_retriever": {"query": query},
    "text_embedder": {"text": query},
    "ranker": {"query": query},
    "prompt_builder": {"query": query}
})
print(result["llm"]["replies"][0])
Haystack integrates deeply with Elasticsearch, making it a good fit for large-scale, enterprise-grade search applications
# Requires the Elasticsearch integration: pip install elasticsearch-haystack
from haystack import Document
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(
    hosts="http://localhost:9200",
    index="haystack_documents"
)

# Prepare documents
docs = [
    Document(content="Haystack is an NLP framework", meta={"source": "doc1"}),
    Document(content="It supports enterprise-grade search", meta={"source": "doc2"})
]

# Embed the documents, then write them to the store
# (the embedding model determines the vector dimension)
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
embedder.warm_up()
docs_with_embeddings = embedder.run(documents=docs)["documents"]
document_store.write_documents(docs_with_embeddings)
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever, ElasticsearchEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder

pipeline = Pipeline()

# BM25 retrieval (exact keyword matching)
pipeline.add_component("bm25", ElasticsearchBM25Retriever(document_store=document_store))

# Embedding retrieval (semantic similarity); the query text must be embedded first
pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"))
pipeline.add_component("embedding", ElasticsearchEmbeddingRetriever(document_store=document_store))

# Merge and rerank
pipeline.add_component("joiner", DocumentJoiner())
pipeline.add_component("ranker", TransformersSimilarityRanker(top_k=5))

# Prompt and generation (reuse the QA template defined above)
pipeline.add_component("prompt", PromptBuilder(template=template))
pipeline.add_component("llm", OpenAIGenerator())

# Connect
pipeline.connect("text_embedder.embedding", "embedding.query_embedding")
pipeline.connect("bm25", "joiner")
pipeline.connect("embedding", "joiner")
pipeline.connect("joiner", "ranker")
pipeline.connect("ranker", "prompt.documents")
pipeline.connect("prompt", "llm")

# Query
query = "enterprise search solutions"
result = pipeline.run({
    "bm25": {"query": query},
    "text_embedder": {"text": query},
    "ranker": {"query": query},
    "prompt": {"query": query}
})
Supports multiple document formats, hybrid retrieval, and high-performance search
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument, TextFileToDocument  # PyPDFToDocument requires: pip install pypdf
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter

# Document indexing pipeline
indexing_pipeline = Pipeline()

# Converters for each input format
indexing_pipeline.add_component("pdf_converter", PyPDFToDocument())
indexing_pipeline.add_component("text_converter", TextFileToDocument())

# Join the converter outputs, then clean and split
indexing_pipeline.add_component("joiner", DocumentJoiner())
indexing_pipeline.add_component("cleaner", DocumentCleaner())
indexing_pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=5))

# Embedding
indexing_pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder())

# Write to Elasticsearch
indexing_pipeline.add_component("writer", DocumentWriter(document_store=document_store))

# Connect the components (DocumentCleaner accepts a single documents input,
# so the two converters are merged through the joiner first)
indexing_pipeline.connect("pdf_converter", "joiner")
indexing_pipeline.connect("text_converter", "joiner")
indexing_pipeline.connect("joiner", "cleaner")
indexing_pipeline.connect("cleaner", "splitter")
indexing_pipeline.connect("splitter", "embedder")
indexing_pipeline.connect("embedder", "writer")

# Index documents
indexing_pipeline.run({
    "pdf_converter": {"sources": ["doc1.pdf", "doc2.pdf"]},
    "text_converter": {"sources": ["doc3.txt"]}
})
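After indexing, a quick sanity check confirms the documents reached the store (count_documents is part of the document store API):

print(document_store.count_documents())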
Automated question answering over FAQ documents, supporting both semantic understanding and exact matching
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.rankers import TransformersSimilarityRanker

# Build the FAQ search pipeline
faq_pipeline = Pipeline()

# Two retrieval branches
faq_pipeline.add_component("bm25", InMemoryBM25Retriever(document_store, top_k=20))
faq_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
faq_pipeline.add_component("embedding", InMemoryEmbeddingRetriever(document_store, top_k=20))

# Merge
faq_pipeline.add_component("joiner", DocumentJoiner())

# Rerank (improves the quality of the top results)
faq_pipeline.add_component("ranker", TransformersSimilarityRanker(
    model="BAAI/bge-reranker-base",
    top_k=3
))

# Generate a friendly answer (the template uses Jinja2 syntax)
faq_pipeline.add_component("prompt", PromptBuilder(template="""
Answer the user's question based on the following FAQ entries:
{% for doc in documents %}
{{ doc.content }}
---
{% endfor %}
User question: {{ query }}
"""))
faq_pipeline.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo"))

# Connect
faq_pipeline.connect("text_embedder.embedding", "embedding.query_embedding")
faq_pipeline.connect("bm25", "joiner")
faq_pipeline.connect("embedding", "joiner")
faq_pipeline.connect("joiner", "ranker.documents")
faq_pipeline.connect("ranker", "prompt.documents")
faq_pipeline.connect("prompt", "llm")

# Query
query = "How do I get a refund?"
result = faq_pipeline.run({
    "bm25": {"query": query},
    "text_embedder": {"text": query},
    "ranker": {"query": query},
    "prompt": {"query": query}
})
print(result["llm"]["replies"][0])
print(result["llm"]["replies"][0])
Fetch web page content and answer questions about the latest information in real time
from haystack.components.websearch import SerperDevWebSearch
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.utils import Secret

# Web search pipeline
web_pipeline = Pipeline()

# 1. Search API returns links
web_pipeline.add_component("search", SerperDevWebSearch(api_key=Secret.from_token("your-key")))
# 2. Fetch the page content
web_pipeline.add_component("fetcher", LinkContentFetcher())
# 3. Convert HTML to documents
web_pipeline.add_component("converter", HTMLToDocument())
# 4. Split into chunks
web_pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))
# 5. Generate the answer (reuse the QA template defined above)
web_pipeline.add_component("prompt", PromptBuilder(template=template))
web_pipeline.add_component("llm", OpenAIGenerator())

# Connect
web_pipeline.connect("search.links", "fetcher.urls")
web_pipeline.connect("fetcher.streams", "converter.sources")
web_pipeline.connect("converter", "splitter")
web_pipeline.connect("splitter", "prompt.documents")
web_pipeline.connect("prompt", "llm")

# Real-time search
query = "Latest AI developments in 2024"
result = web_pipeline.run({
    "search": {"query": query},
    "prompt": {"query": query}
})
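As in the earlier pipelines, the generated answer is available in the generator's replies:

print(result["llm"]["replies"][0])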
Haystack 2.x supports agents and tool calling
from haystack.components.agents import Agent
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.dataclasses import ChatMessage

# The Agent component wraps a chat generator plus a list of Tool objects
agent = Agent(
    chat_generator=OpenAIChatGenerator(),
    tools=[search_tool, calculator_tool],
    system_prompt="You are a helpful assistant."
)
result = agent.run(messages=[ChatMessage.from_user("Search for the latest AI news and summarize it")])
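search_tool and calculator_tool are not defined above; a minimal sketch of declaring one with Haystack's Tool class, where the add function and its JSON schema are illustrative assumptions:

from haystack.tools import Tool

def add(a: int, b: int) -> int:
    """Add two integers."""
    return a + b

calculator_tool = Tool(
    name="calculator",
    description="Adds two integers",
    parameters={
        "type": "object",
        "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
        "required": ["a", "b"],
    },
    function=add,
)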
Built-in evaluation components let you quantify system performance
from haystack.components.evaluators import (
    FaithfulnessEvaluator,
    ContextRelevanceEvaluator
)

# Faithfulness: is the answer grounded in the retrieved contexts?
faithfulness = FaithfulnessEvaluator()
result = faithfulness.run(
    questions=["question"],
    contexts=[["context"]],
    predicted_answers=["answer"]
)

# Context relevance: are the retrieved contexts relevant to the question?
relevance = ContextRelevanceEvaluator()
result = relevance.run(
    questions=["question"],
    contexts=[["context"]]
)
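Both evaluators are regular components, so they can also be wired into a single evaluation pipeline and run over a batch; a minimal sketch, assuming questions, contexts, and answers are lists taken from your own evaluation set:

eval_pipeline = Pipeline()
eval_pipeline.add_component("faithfulness", FaithfulnessEvaluator())
eval_pipeline.add_component("context_relevance", ContextRelevanceEvaluator())

eval_results = eval_pipeline.run({
    "faithfulness": {"questions": questions, "contexts": contexts, "predicted_answers": answers},
    "context_relevance": {"questions": questions, "contexts": contexts},
})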
Process text, tables, and images
from haystack.components.converters import (
    MarkdownToDocument,
    TextFileToDocument
)

# Table extraction from PDFs or scans is typically handled at conversion time,
# e.g. with OCR-capable converters or the unstructured integration,
# rather than by a dedicated extractor component.
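A short sketch of running these converters directly; the file names are placeholders:

# MarkdownToDocument may need extra dependencies (e.g. markdown-it-py)
md_converter = MarkdownToDocument()
txt_converter = TextFileToDocument()

docs = (
    md_converter.run(sources=["notes.md"])["documents"]
    + txt_converter.run(sources=["readme.txt"])["documents"]
)
print(len(docs), "documents converted")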
Stream the generated answer token by token
# Enable streaming: the callback receives StreamingChunk objects as tokens arrive
generator = OpenAIGenerator(
    model="gpt-4",
    streaming_callback=lambda chunk: print(chunk.content, end="")
)
result = generator.run(prompt="...")
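Recent Haystack 2.x releases also ship a ready-made callback for this pattern:

from haystack.components.generators.utils import print_streaming_chunk

generator = OpenAIGenerator(model="gpt-4", streaming_callback=print_streaming_chunk)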
Haystack 2.x: redesigned from the ground up, with a clearer Pipeline API and better componentization
Haystack 1.x: the legacy version; some features are more mature
Recommendation: use 2.x for new projects; existing 1.x projects can stay on 1.x
Choose according to your needs:
| Feature | Haystack | LangChain | LlamaIndex |
|---|---|---|---|
| Core focus | Search + QA | General-purpose LLM framework | Data indexing |
| Search capability | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐ |
| Agent support | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐ |
💡 Well suited to large-scale document search and enterprise knowledge bases, with deep Elasticsearch integration