Remove the post-conversion PDF unlink code

This commit is contained in:
blazickjp
2025-04-10 19:43:33 -07:00
parent 8f63ef2554
commit d99ff0f0dd

View File

@@ -22,6 +22,7 @@ conversion_statuses: Dict[str, Any] = {}
@dataclass
class ConversionStatus:
"""Track the status of a PDF to Markdown conversion."""
paper_id: str
status: str # 'downloading', 'converting', 'success', 'error'
started_at: datetime
@@ -37,16 +38,16 @@ download_tool = types.Tool(
"properties": {
"paper_id": {
"type": "string",
"description": "The arXiv ID of the paper to download"
"description": "The arXiv ID of the paper to download",
},
"check_status": {
"type": "boolean",
"description": "If true, only check conversion status without downloading",
"default": False
}
"default": False,
},
},
"required": ["paper_id"]
}
"required": ["paper_id"],
},
)
@@ -63,7 +64,7 @@ def convert_pdf_to_markdown(paper_id: str, pdf_path: Path) -> None:
logger.info(f"Starting conversion for {paper_id}")
markdown = pymupdf4llm.to_markdown(pdf_path, show_progress=False)
md_path = get_paper_path(paper_id, ".md")
with open(md_path, "w", encoding="utf-8") as f:
f.write(markdown)
@@ -71,11 +72,10 @@ def convert_pdf_to_markdown(paper_id: str, pdf_path: Path) -> None:
if status:
status.status = "success"
status.completed_at = datetime.now()
# Clean up PDF after successful conversion
pdf_path.unlink()
logger.info(f"Conversion completed for {paper_id}")
except Exception as e:
logger.error(f"Conversion failed for {paper_id}: {str(e)}")
status = conversion_statuses.get(paper_id)
@@ -90,108 +90,137 @@ async def handle_download(arguments: Dict[str, Any]) -> List[types.TextContent]:
try:
paper_id = arguments["paper_id"]
check_status = arguments.get("check_status", False)
# If only checking status
if check_status:
status = conversion_statuses.get(paper_id)
if not status:
if get_paper_path(paper_id, ".md").exists():
return [types.TextContent(
return [
types.TextContent(
type="text",
text=json.dumps(
{
"status": "success",
"message": "Paper is ready",
"resource_uri": f"file://{get_paper_path(paper_id, '.md')}",
}
),
)
]
return [
types.TextContent(
type="text",
text=json.dumps({
"status": "success",
"message": "Paper is ready",
"resource_uri": f"file://{get_paper_path(paper_id, '.md')}"
})
)]
return [types.TextContent(
text=json.dumps(
{
"status": "unknown",
"message": "No download or conversion in progress",
}
),
)
]
return [
types.TextContent(
type="text",
text=json.dumps({
"status": "unknown",
"message": "No download or conversion in progress"
})
)]
return [types.TextContent(
type="text",
text=json.dumps({
"status": status.status,
"started_at": status.started_at.isoformat(),
"completed_at": status.completed_at.isoformat() if status.completed_at else None,
"error": status.error,
"message": f"Paper conversion {status.status}"
})
)]
text=json.dumps(
{
"status": status.status,
"started_at": status.started_at.isoformat(),
"completed_at": (
status.completed_at.isoformat()
if status.completed_at
else None
),
"error": status.error,
"message": f"Paper conversion {status.status}",
}
),
)
]
# Check if paper is already converted
if get_paper_path(paper_id, ".md").exists():
return [types.TextContent(
type="text",
text=json.dumps({
"status": "success",
"message": "Paper already available",
"resource_uri": f"file://{get_paper_path(paper_id, '.md')}"
})
)]
return [
types.TextContent(
type="text",
text=json.dumps(
{
"status": "success",
"message": "Paper already available",
"resource_uri": f"file://{get_paper_path(paper_id, '.md')}",
}
),
)
]
# Check if already in progress
if paper_id in conversion_statuses:
status = conversion_statuses[paper_id]
return [types.TextContent(
type="text",
text=json.dumps({
"status": status.status,
"message": f"Paper conversion {status.status}",
"started_at": status.started_at.isoformat()
})
)]
return [
types.TextContent(
type="text",
text=json.dumps(
{
"status": status.status,
"message": f"Paper conversion {status.status}",
"started_at": status.started_at.isoformat(),
}
),
)
]
# Start new download and conversion
pdf_path = get_paper_path(paper_id, ".pdf")
client = arxiv.Client()
# Initialize status
conversion_statuses[paper_id] = ConversionStatus(
paper_id=paper_id,
status="downloading",
started_at=datetime.now()
paper_id=paper_id, status="downloading", started_at=datetime.now()
)
# Download PDF
paper = next(client.results(arxiv.Search(id_list=[paper_id])))
paper.download_pdf(dirpath=pdf_path.parent, filename=pdf_path.name)
# Update status and start conversion
status = conversion_statuses[paper_id]
status.status = "converting"
# Start conversion in thread
asyncio.create_task(
asyncio.to_thread(convert_pdf_to_markdown, paper_id, pdf_path)
)
return [types.TextContent(
type="text",
text=json.dumps({
"status": "converting",
"message": "Paper downloaded, conversion started",
"started_at": status.started_at.isoformat()
})
)]
return [
types.TextContent(
type="text",
text=json.dumps(
{
"status": "converting",
"message": "Paper downloaded, conversion started",
"started_at": status.started_at.isoformat(),
}
),
)
]
except StopIteration:
return [types.TextContent(
type="text",
text=json.dumps({
"status": "error",
"message": f"Paper {paper_id} not found on arXiv"
})
)]
return [
types.TextContent(
type="text",
text=json.dumps(
{
"status": "error",
"message": f"Paper {paper_id} not found on arXiv",
}
),
)
]
except Exception as e:
return [types.TextContent(
type="text",
text=json.dumps({
"status": "error",
"message": f"Error: {str(e)}"
})
)]
return [
types.TextContent(
type="text",
text=json.dumps({"status": "error", "message": f"Error: {str(e)}"}),
)
]