
Commit 06b530c

Author: jravenel
feat: Migrated ArXiv agent components to new modular structure
1 parent 0b13283 commit 06b530c

File tree: 6 files changed, +450 -25 lines changed


Makefile (+1 -1)
@@ -105,7 +105,7 @@ chat-powerpoint-agent: .venv
 	@ docker compose run abi bash -c 'poetry install && poetry run python -m src.core.apps.terminal_agent.main generic_run_agent PowerPointAssistant'
 
 chat-arxiv-agent: .venv
-	@ docker compose run abi bash -c 'poetry install && poetry run chat-arxiv-agent'
+	@ docker compose run abi bash -c 'poetry install && poetry run python -m src.core.apps.terminal_agent.main generic_run_agent ArXivAssistant'
 
 .DEFAULT_GOAL := chat-supervisor-agent
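The chat-arxiv-agent target now goes through the same generic terminal-agent entrypoint as the other assistants instead of a dedicated poetry script. The actual src.core.apps.terminal_agent.main module is not part of this diff, so the following is only a hedged sketch of how such a runner could resolve an assistant name to a module exposing the create_agent() factory introduced below; the registry, module paths, and the commented-out run call are illustrative assumptions, not the repository's implementation.

# Hypothetical sketch of a generic_run_agent-style dispatcher; the real
# src.core.apps.terminal_agent.main is not shown in this commit.
import importlib
import sys

AGENT_MODULES = {
    # Assumption: a registry mapping assistant names to their module paths.
    "ArXivAssistant": "src.custom.modules.arxiv_agent.assistants.ArXivAssistant",
}

def generic_run_agent(assistant_name: str) -> None:
    module = importlib.import_module(AGENT_MODULES[assistant_name])
    agent = module.create_agent()  # factory exposed by ArXivAssistant.py after this commit
    # Assumption: the Agent base class exposes some interactive loop; its exact
    # method name is not visible in this diff, so the call is left commented out.
    # agent.run()

if __name__ == "__main__":
    generic_run_agent(sys.argv[-1])  # e.g. "ArXivAssistant"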

src/custom/arxiv-agent/assistants/ArXivAssistant.py → src/custom/modules/arxiv_agent/assistants/ArXivAssistant.py (+25 -7)
@@ -1,27 +1,35 @@
 from langchain_openai import ChatOpenAI
 from abi.services.agent.Agent import Agent, AgentConfiguration, AgentSharedState, MemorySaver
 from src import secret, config
-from src.integrations.ArXivIntegration import ArXivIntegration, ArXivIntegrationConfiguration
-from src.pipelines.arxiv.ArXivPaperPipeline import ArXivPaperPipeline, ArXivPaperPipelineConfiguration
+from src.custom.modules.arxiv_agent.integrations.ArXivIntegration import ArXivIntegration, ArXivIntegrationConfiguration
+from src.custom.modules.arxiv_agent.pipelines.ArXivPaperPipeline import ArXivPaperPipeline, ArXivPaperPipelineConfiguration
 from abi.services.ontology_store.adaptors.secondary.OntologyStoreService__SecondaryAdaptor__Filesystem import OntologyStoreService__SecondaryAdaptor__Filesystem
 from abi.services.ontology_store.OntologyStoreService import OntologyStoreService
+from src.custom.modules.arxiv_agent.workflows.ArXivQueryWorkflow import ArXivQueryWorkflow, ArXivQueryWorkflowConfiguration
 
 NAME = "ArXiv Assistant"
+SLUG = "arxiv-assistant"
 DESCRIPTION = "Search and analyze research papers from ArXiv"
+AVATAR_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a8/ArXiv_web.svg/1200px-ArXiv_web.svg.png"
 SYSTEM_PROMPT = """You are an ArXiv research assistant. You can help users search for papers, get paper details, and analyze research trends.
 You have access to the following tools:
 - search_arxiv_papers: Search for papers on ArXiv
 - get_arxiv_paper: Get metadata for a specific paper
-- arxiv_paper_pipeline: Add papers to the knowledge graph
+- arxiv_paper_pipeline: Add papers to the knowledge graph and download PDFs
+- query_arxiv_authors: Find the authors of a paper in the knowledge graph
+- query_arxiv_papers: Find papers by author or category in the knowledge graph
+- execute_arxiv_query: Run a custom SPARQL query on the knowledge graph
 
-When users ask about papers, first search for relevant papers using search_arxiv_papers. Then you can get detailed information about specific papers using get_arxiv_paper.
-Use arxiv_paper_pipeline to add important papers to the knowledge graph for future reference."""
+When users ask about papers, first search for relevant papers using search_arxiv_papers.
+Then you can get detailed information about specific papers using get_arxiv_paper.
+Use arxiv_paper_pipeline to add important papers to the knowledge graph for future reference.
+Use the query tools to search for information in papers you've already added to the knowledge graph."""
 
 class ArXivAssistant(Agent):
     """Assistant for interacting with ArXiv papers."""
     pass
 
-def create_arxiv_agent(
+def create_agent(
     agent_shared_state: AgentSharedState = None,
     agent_configuration: AgentConfiguration = None
 ) -> Agent:
@@ -58,11 +66,21 @@ def create_arxiv_agent(
     arxiv_pipeline = ArXivPaperPipeline(
         ArXivPaperPipelineConfiguration(
             arxiv_integration_config=arxiv_integration_config,
-            ontology_store=ontology_store
+            ontology_store=ontology_store,
+            storage_base_path="storage/triplestore/application-level/arxiv",
+            pdf_storage_path="datastore/application-level/arxiv"
         )
     )
     tools += arxiv_pipeline.as_tools()
 
+    # Add ArXiv query workflow
+    arxiv_query_workflow = ArXivQueryWorkflow(
+        ArXivQueryWorkflowConfiguration(
+            storage_path="storage/triplestore/application-level/arxiv"
+        )
+    )
+    tools += arxiv_query_workflow.as_tools()
+
     # Use provided configuration or create default
     if agent_configuration is None:
         agent_configuration = AgentConfiguration(
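For orientation after the rename from create_arxiv_agent to create_agent, here is a minimal usage sketch of the migrated assistant. The completion-style call at the end is commented out because the Agent interface's method names are not shown in this diff and are only an assumption.

# Minimal usage sketch, assuming the module layout introduced by this commit.
from src.custom.modules.arxiv_agent.assistants.ArXivAssistant import create_agent

# Build the assistant with default shared state and configuration; this wires in
# the ArXiv integration, paper pipeline, and query workflow tools shown above.
agent = create_agent()

# Assumption: the Agent exposes a chat/invoke-style entrypoint (name not shown here).
# agent.invoke("Find recent papers on retrieval-augmented generation")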

src/custom/arxiv-agent/pipelines/ArXivPaperPipeline.py → src/custom/modules/arxiv_agent/pipelines/ArXivPaperPipeline.py (+56 -17)
@@ -1,5 +1,9 @@
 from dataclasses import dataclass
 from datetime import datetime
+import os
+import uuid
+import re
+import requests
 from rdflib import Graph
 from pydantic import BaseModel, Field
 from typing import Optional
@@ -8,19 +12,21 @@
 
 from abi.pipeline import Pipeline, PipelineConfiguration
 from abi.utils.Graph import ABIGraph, ABI, BFO
-from src.integrations.ArXivIntegration import ArXivIntegration, ArXivIntegrationConfiguration
+from src.custom.modules.arxiv_agent.integrations.ArXivIntegration import ArXivIntegration, ArXivIntegrationConfiguration
 from abi.services.ontology_store.OntologyStorePorts import IOntologyStoreService
 
 @dataclass
 class ArXivPaperPipelineConfiguration(PipelineConfiguration):
     """Configuration for ArXivPaperPipeline."""
     arxiv_integration_config: ArXivIntegrationConfiguration
     ontology_store: IOntologyStoreService
-    ontology_store_name: str = "arxiv"
+    storage_base_path: str = "storage/triplestore/application-level/arxiv"
+    pdf_storage_path: str = "datastore/application-level/arxiv"
 
 class ArXivPaperPipelineParameters(BaseModel):
     """Parameters for ArXivPaperPipeline."""
     paper_id: str = Field(..., description="ArXiv paper ID")
+    download_pdf: bool = Field(True, description="Whether to download the paper's PDF")
 
 class ArXivPaperPipeline(Pipeline):
     """Pipeline for adding ArXiv papers to the ontology."""
@@ -29,20 +35,18 @@ def __init__(self, configuration: ArXivPaperPipelineConfiguration):
         super().__init__(configuration)
         self.__configuration = configuration
         self.__arxiv_integration = ArXivIntegration(configuration.arxiv_integration_config)
+
+        # Ensure storage directories exist
+        os.makedirs(self.__configuration.storage_base_path, exist_ok=True)
+        os.makedirs(self.__configuration.pdf_storage_path, exist_ok=True)
 
     def run(self, parameters: ArXivPaperPipelineParameters) -> Graph:
         # Init graph
-        try:
-            existing_graph = self.__configuration.ontology_store.get(self.__configuration.ontology_store_name)
-            graph = ABIGraph()
-            for triple in existing_graph:
-                graph.add(triple)
-        except Exception:
-            graph = ABIGraph()
+        graph = ABIGraph()
 
         # Get paper data
         paper_data = self.__arxiv_integration.get_paper(parameters.paper_id)
-
+
         # Add paper to graph
         paper = graph.add_individual_to_prefix(
             prefix=ABI,
@@ -86,25 +90,60 @@ def run(self, parameters: ArXivPaperPipelineParameters) -> Graph:
         )
         graph.add((paper, ABI.hasCategory, cat))
 
-        self.__configuration.ontology_store.store(self.__configuration.ontology_store_name, graph)
+        # Generate a unique filename based on paper title and UUID
+        # Clean the title to create a valid filename
+        safe_title = re.sub(r'[^\w\s-]', '', paper_data["title"])
+        safe_title = re.sub(r'[\s-]+', '_', safe_title).lower()
+        safe_title = safe_title[:50]  # Limit length
+        unique_id = str(uuid.uuid4())
+
+        # Store the TTL file
+        ttl_filename = f"{safe_title}_{unique_id}.ttl"
+        ttl_filepath = os.path.join(self.__configuration.storage_base_path, ttl_filename)
+
+        with open(ttl_filepath, 'wb') as f:
+            f.write(graph.serialize(format="turtle").encode('utf-8'))
+
+        print(f"Paper metadata stored at: {ttl_filepath}")
+
+        # Download PDF if requested
+        if parameters.download_pdf and paper_data["pdf_url"]:
+            try:
+                pdf_filename = f"{safe_title}_{unique_id}.pdf"
+                pdf_filepath = os.path.join(self.__configuration.pdf_storage_path, pdf_filename)
+
+                # Add PDF file path to graph
+                graph.add((paper, ABI.localFilePath, pdf_filepath))
+
+                response = requests.get(paper_data["pdf_url"], stream=True)
+                response.raise_for_status()
+
+                with open(pdf_filepath, 'wb') as pdf_file:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        pdf_file.write(chunk)
+
+                print(f"PDF downloaded to: {pdf_filepath}")
+
+                # Update the TTL file to include the PDF file path
+                with open(ttl_filepath, 'wb') as f:
+                    f.write(graph.serialize(format="turtle").encode('utf-8'))
+            except Exception as e:
+                print(f"Error downloading PDF: {e}")
+
         return graph
 
     def as_tools(self) -> list[StructuredTool]:
         return [
             StructuredTool(
                 name="arxiv_paper_pipeline",
-                description="Adds an ArXiv paper to the ontology",
+                description="Adds an ArXiv paper to the ontology and optionally downloads the PDF",
                 func=lambda **kwargs: self.run(ArXivPaperPipelineParameters(**kwargs)),
                 args_schema=ArXivPaperPipelineParameters
             )
         ]
 
     def as_api(self, router: APIRouter) -> None:
-        """Adds API endpoints for this pipeline to the given router.
-
-        Args:
-            router (APIRouter): FastAPI router to add endpoints to
-        """
+        """Adds API endpoints for this pipeline to the given router."""
         @router.post("/arxiv/paper")
         def run(parameters: ArXivPaperPipelineParameters):
             return self.run(parameters).serialize(format="turtle")
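To make the new persistence behaviour concrete, here is a small worked example of the filename sanitization applied to a sample paper title, followed by a direct pipeline invocation. The paper ID is the well-known "Attention Is All You Need" preprint, used purely as an illustration, and the pipeline instance is assumed to be configured as sketched earlier.

import re
import uuid

# Mirrors the sanitization in run(): strip punctuation, collapse whitespace and
# hyphens into underscores, lowercase, and cap the length at 50 characters.
title = "Attention Is All You Need"
safe_title = re.sub(r'[^\w\s-]', '', title)
safe_title = re.sub(r'[\s-]+', '_', safe_title).lower()[:50]
print(f"{safe_title}_{uuid.uuid4()}.ttl")
# -> attention_is_all_you_need_<random-uuid>.ttl

# Running the pipeline stores that .ttl file and, by default, downloads the PDF too:
# graph = pipeline.run(ArXivPaperPipelineParameters(paper_id="1706.03762"))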
