Merge pull request #11 from robertjakob/feature/pdf-processing-improvements

Restore V2 and V3 directories
Robert Jakob
2025-05-05 20:16:01 +02:00
committed by GitHub
17 changed files with 1259 additions and 0 deletions

View File

@@ -0,0 +1,42 @@
# Environment variables
.env
# Manuscripts
manuscripts/
analysis_results/
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# Virtual Environment
venv/
ENV/
# IDE
.idea/
.vscode/
*.swp
*.swo
# OS
.DS_Store
Thumbs.db

View File

@@ -0,0 +1,78 @@
# V2 - Editorial First Decision Support
A tool that analyzes academic manuscripts against editorial requirements using OpenAI's GPT models.
## Features
- Extracts text from PDF manuscripts
- Analyzes each manuscript against a list of editorial requirements
- Identifies which requirements are met and which are not
- Provides specific evidence for unmet requirements
- Generates a desk rejection recommendation
- Processes multiple PDFs in batch
## Setup
1. Clone this repository
2. Install dependencies:
```bash
pip install -r requirements.txt
```
3. Set up your OpenAI API key:
- Either set it as an environment variable: `export OPENAI_API_KEY=your-key-here`
- Or provide it via command line argument: `--api-key your-key-here`
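   Since `python-dotenv` is listed in `requirements.txt` and the client also loads a `.env` file, a third option is to keep the key in a `.env` file in the project root (the key value below is a placeholder):
```bash
echo "OPENAI_API_KEY=your-key-here" > .env
```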
## Usage
1. Place your PDF manuscripts in the `manuscripts/` directory
2. Create a text file with your editorial requirements (one per line)
3. Run the checker:
```bash
python src/main.py --requirements path/to/requirements.txt
```
Optional arguments (a combined example follows this list):
- `--manuscripts-dir`: Directory containing PDFs (default: manuscripts)
- `--output-dir`: Directory for analysis results (default: analysis_results)
- `--api-key`: Your OpenAI API key
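Putting these together, a full invocation with every optional argument spelled out might look like this (paths and key are illustrative):
```bash
python src/main.py \
  --requirements example_requirements.txt \
  --manuscripts-dir manuscripts \
  --output-dir analysis_results \
  --api-key your-key-here
```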
## Project Structure
```
.
├── manuscripts/ # Directory for PDF manuscripts
├── analysis_results/ # Directory for analysis output files
├── src/ # Source code
│ ├── main.py
│ ├── pdf_parser.py
│ ├── openai_client.py
│ └── requirements_checker.py
├── requirements.txt # Python dependencies
└── example_requirements.txt # Example requirements file
```
## Example Requirements File
```
Manuscript must be under 5000 words
Abstract must be structured (Background, Methods, Results, Conclusion)
Figures must be in high resolution (300 DPI minimum)
```
## Output
For each PDF in the manuscripts directory, the tool will do the following (a sample excerpt appears after this list):
1. Create a separate analysis file in the `analysis_results/` directory
2. Name the file `{manuscript_name}_analysis.txt`
3. Include:
- Analysis of each requirement (met/not met)
- Evidence for unmet requirements
- Final desk rejection recommendation with justification
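An illustrative excerpt of an analysis file, following the structure produced by `format_results` (all values here are made up):
```
=== Manuscript Requirements Analysis ===

Requirement: Manuscript must be under 5000 words
Status: ✗ Not Met
Evidence: The document metadata reports a word count of roughly 7,200 words.
Explanation: The manuscript exceeds the stated word limit.

=== Final Recommendation ===
Desk Rejection: Yes
Justification: A mandatory length requirement is not met.
```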
## Development
The project structure is modular and easy to extend (a short programmatic example follows this list):
- `pdf_parser.py`: Handles PDF text extraction
- `openai_client.py`: Manages OpenAI API interactions
- `requirements_checker.py`: Orchestrates the analysis process
- `main.py`: Provides the CLI interface
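The pieces can also be composed directly from Python. A minimal sketch, assuming it is run from the `src/` directory (the modules use flat imports); the PDF path and the example requirement are purely illustrative:
```python
from requirements_checker import RequirementsChecker

# With no api_key argument, the key is read from OPENAI_API_KEY (or a .env file)
checker = RequirementsChecker()
results = checker.check_manuscript(
    "../manuscripts/example.pdf",            # illustrative path
    ["Manuscript must be under 5000 words"],  # illustrative requirement
)
print(checker.format_results(results))
```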

View File

@@ -0,0 +1,4 @@
openai>=1.0.0
PyMuPDF>=1.23.0
python-dotenv>=1.0.0
pytest>=7.0.0

View File

@@ -0,0 +1,15 @@
Manuscript must be under 5000 words
Abstract must be structured with Background, Methods, Results, and Conclusion sections
All figures must be in high resolution (600 DPI minimum) with detailed captions
Methods section must include comprehensive statistical analysis procedures
Results must be presented with appropriate statistical tests and p-values
References must follow APA format
All abbreviations must be defined at first use with a list of abbreviations provided
Conflict of interest statement must be included
Ethics approval must be mentioned if human subjects were involved
Data availability statement must be included
Funding sources must be acknowledged
Author contributions must be specified
Limitations of the study must be discussed
Future research directions must be suggested
Key findings must be summarized in a conclusion section

View File

@@ -0,0 +1,15 @@
Manuscript must be under 10000 words
Abstract should include key information about the study
Figures should be clear and readable
Methods section should describe the approach used
Results should be clearly presented
References should be consistent in format
Abbreviations should be explained where used
Conflict of interest statement is optional
Ethics approval should be mentioned if required by local regulations
Data availability statement is optional
Funding information can be included if relevant
Author contributions can be mentioned if desired
Study limitations can be discussed if relevant
Future work can be suggested if appropriate
A conclusion section is recommended

View File

@@ -0,0 +1,110 @@
import argparse
import json
import os
from typing import List
from requirements_checker import RequirementsChecker
def read_requirements(requirements_path: str) -> List[str]:
"""
Read requirements from a text file.
Args:
requirements_path (str): Path to the requirements file
Returns:
List[str]: List of requirements
"""
with open(requirements_path, 'r') as f:
return [line.strip() for line in f if line.strip()]
def get_pdf_files(directory: str) -> List[str]:
"""
Get all PDF files from a directory.
Args:
directory (str): Path to the directory
Returns:
List[str]: List of PDF file paths
"""
pdf_files = []
for file in os.listdir(directory):
if file.lower().endswith('.pdf'):
pdf_files.append(os.path.join(directory, file))
return pdf_files
def analyze_manuscript(checker: RequirementsChecker, pdf_path: str, requirements: List[str], output_dir: str) -> None:
"""
Analyze a single manuscript and save results to a file.
Args:
checker (RequirementsChecker): The requirements checker instance
pdf_path (str): Path to the PDF file
requirements (List[str]): List of requirements to check
output_dir (str): Directory to save the results
"""
try:
# Get the base filename without extension
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
# Analyze manuscript
results = checker.check_manuscript(pdf_path, requirements)
# Format results
formatted_results = checker.format_results(results)
# Save results to file
output_file = os.path.join(output_dir, f"{base_name}_analysis.txt")
with open(output_file, 'w') as f:
f.write(formatted_results)
print(f"Analysis completed for {base_name}")
print(f"Results saved to: {output_file}\n")
except Exception as e:
print(f"Error processing {pdf_path}: {str(e)}\n")
def main():
parser = argparse.ArgumentParser(description='Manuscript Requirements Checker')
parser.add_argument('--manuscripts-dir', default='manuscripts',
help='Directory containing PDF manuscripts (default: manuscripts)')
parser.add_argument('--requirements', required=True, help='Path to the requirements text file')
parser.add_argument('--output-dir', default='analysis_results',
help='Directory to save analysis results (default: analysis_results)')
parser.add_argument('--api-key', help='OpenAI API key (optional if set in environment)')
args = parser.parse_args()
try:
# Create output directory if it doesn't exist
os.makedirs(args.output_dir, exist_ok=True)
# Read requirements
requirements = read_requirements(args.requirements)
# Get PDF files
pdf_files = get_pdf_files(args.manuscripts_dir)
if not pdf_files:
print(f"No PDF files found in {args.manuscripts_dir}")
return 1
print(f"Found {len(pdf_files)} PDF files to analyze")
# Initialize checker
checker = RequirementsChecker(api_key=args.api_key)
# Process each PDF
for pdf_path in pdf_files:
analyze_manuscript(checker, pdf_path, requirements, args.output_dir)
print("Analysis complete!")
except Exception as e:
print(f"Error: {str(e)}")
return 1
return 0
if __name__ == '__main__':
exit(main())

View File

@@ -0,0 +1,134 @@
import os
import json
from typing import List, Dict, Any
from openai import OpenAI
from dotenv import load_dotenv
class OpenAIClient:
"""A class to handle interactions with the OpenAI API."""
def __init__(self, api_key: str = None):
"""
Initialize the OpenAI client.
Args:
api_key (str, optional): OpenAI API key. If not provided, will try to load from environment.
"""
# Try to load .env from the current directory
load_dotenv()
# If API key is not found, try to load from parent directory
if not os.getenv("OPENAI_API_KEY"):
# Get the path to the parent directory (two levels up from this file)
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
env_path = os.path.join(parent_dir, ".env")
if os.path.exists(env_path):
load_dotenv(env_path)
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
if not self.api_key:
raise ValueError("OpenAI API key is required")
self.client = OpenAI(api_key=self.api_key)
def check_requirements(self, manuscript_text: str, requirements: List[str]) -> Dict[str, Any]:
"""
Check if the manuscript meets the given requirements using GPT-3.5-turbo.
Args:
manuscript_text (str): The full text of the manuscript
requirements (List[str]): List of editorial requirements to check
Returns:
Dict[str, Any]: Analysis results including requirement status and evidence
"""
# Truncate manuscript text to first 4000 words to reduce token usage
words = manuscript_text.split()
truncated_text = ' '.join(words[:4000]) if len(words) > 4000 else manuscript_text
prompt = self._create_analysis_prompt(truncated_text, requirements)
try:
response = self.client.chat.completions.create(
model="gpt-3.5-turbo", # Using standard model instead of 16k for cost efficiency
messages=[
{"role": "system", "content": "You are an expert manuscript reviewer. Analyze manuscripts against requirements. Be strict and thorough. Only mark requirements as met with clear evidence. Provide specific quotes and exact numbers when applicable. Always respond with valid JSON."},
{"role": "user", "content": prompt}
],
temperature=0.3,
max_tokens=1000 # Limit response length
)
response_content = response.choices[0].message.content
print(f"OpenAI Response: {response_content}") # Debug print
return self._parse_response(response_content)
except Exception as e:
raise Exception(f"Failed to analyze manuscript: {str(e)}")
def _create_analysis_prompt(self, manuscript_text: str, requirements: List[str]) -> str:
"""
Create a prompt for the requirements analysis.
Args:
manuscript_text (str): The manuscript text
requirements (List[str]): List of requirements to check
Returns:
str: Formatted prompt for the OpenAI API
"""
requirements_section = "\n".join([f"{i+1}. {req}" for i, req in enumerate(requirements)])
return f"""Please analyze the following manuscript against these requirements:
{requirements_section}
For each requirement:
1. Determine if it is met (YES/NO)
2. Provide evidence from the text
3. Give a brief explanation
Manuscript text:
{manuscript_text}
Please format your response as a JSON object with the following structure:
{{
"requirements_analysis": [
{{
"requirement": "<requirement text>",
"is_met": <true/false>,
"evidence": "<specific evidence from the text>",
"explanation": "<brief explanation>"
}}
],
"desk_rejection_recommendation": {{
"should_reject": <true/false>,
"justification": "<detailed explanation of the recommendation>"
}}
}}"""
def _parse_response(self, response: str) -> Dict[str, Any]:
"""
Parse the OpenAI API response into a structured format.
Args:
response (str): Raw response from the API
Returns:
Dict[str, Any]: Parsed response
"""
try:
# Remove code block markers if present
cleaned_response = response.strip()
if cleaned_response.startswith("```"):
cleaned_response = cleaned_response.split("\n", 1)[1] # Remove first line
if cleaned_response.endswith("```"):
cleaned_response = cleaned_response.rsplit("\n", 1)[0] # Remove last line
if cleaned_response.startswith("json"):
cleaned_response = cleaned_response.split("\n", 1)[1] # Remove "json" line
return json.loads(cleaned_response)
except json.JSONDecodeError as e:
print(f"Failed to parse JSON: {str(e)}") # Debug print
print(f"Response content: {response}") # Debug print
raise Exception("Failed to parse OpenAI response as JSON")

View File

@@ -0,0 +1,175 @@
import fitz # PyMuPDF
import re
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
@dataclass
class TextBlock:
"""Represents a block of text with its properties."""
text: str
page: int
font_size: float
font_name: str
is_bold: bool
is_italic: bool
bbox: Tuple[float, float, float, float]
class PDFParser:
"""A class to parse PDF manuscripts with advanced text extraction capabilities."""
def __init__(self, pdf_path: str):
"""
Initialize the PDF parser.
Args:
pdf_path (str): Path to the PDF file
"""
self.pdf_path = pdf_path
self.doc = None
self.text_blocks = []
def extract_text(self) -> str:
"""
Extract text from the PDF with structure preservation.
Returns:
str: Extracted and structured text
"""
try:
self.doc = fitz.open(self.pdf_path)
self.text_blocks = []
# Process first 10 pages or less
max_pages = min(10, len(self.doc))
for page_num in range(max_pages):
page = self.doc[page_num]
blocks = self._extract_page_blocks(page, page_num)
self.text_blocks.extend(blocks)
# Sort blocks by position and process
self.text_blocks.sort(key=lambda b: (b.page, b.bbox[1], b.bbox[0]))
# Combine blocks into structured text
structured_text = self._combine_blocks()
return structured_text
except Exception as e:
raise Exception(f"Failed to extract text from PDF: {str(e)}")
finally:
if self.doc:
self.doc.close()
def _extract_page_blocks(self, page: fitz.Page, page_num: int) -> List[TextBlock]:
"""
Extract text blocks from a page with formatting information.
Args:
page (fitz.Page): PDF page
page_num (int): Page number
Returns:
List[TextBlock]: List of text blocks with formatting
"""
blocks = []
# Get text with formatting information
text_dict = page.get_text("dict")
for block in text_dict.get("blocks", []):
for line in block.get("lines", []):
for span in line.get("spans", []):
# Extract text and formatting
text = span.get("text", "").strip()
if not text:
continue
font = span.get("font", "")
size = span.get("size", 0)
bbox = span.get("bbox", [0, 0, 0, 0])
# Check for bold/italic using PyMuPDF span flags
flags = span.get("flags", 0)
is_bold = bool(flags & 2**4)  # bit 4 = bold in PyMuPDF span flags
is_italic = bool(flags & 2**1)  # bit 1 = italic
blocks.append(TextBlock(
text=text,
page=page_num + 1,
font_size=size,
font_name=font,
is_bold=is_bold,
is_italic=is_italic,
bbox=tuple(bbox)
))
return blocks
def _combine_blocks(self) -> str:
"""
Combine text blocks into structured text.
Returns:
str: Structured text
"""
structured_text = []
current_section = None
for block in self.text_blocks:
text = block.text
# Detect headers based on font size and style
if block.font_size > 12 and block.is_bold:
if current_section:
structured_text.append("\n")
current_section = text
structured_text.append(f"\n{text}\n")
else:
# Regular text
structured_text.append(text)
# Add space between paragraphs
if text.endswith(('.', '!', '?')):
structured_text.append("\n")
return " ".join(structured_text)
def get_word_count(self, text: str) -> int:
"""
Get the word count of the text.
Args:
text (str): Text to count words in
Returns:
int: Word count
"""
return len(text.split())
def detect_sections(self) -> Dict[str, List[str]]:
"""
Detect major sections in the document.
Returns:
Dict[str, List[str]]: Dictionary of sections and their content
"""
sections = {}
current_section = "Introduction"
current_content = []
for block in self.text_blocks:
# Detect section headers
if block.font_size > 12 and block.is_bold:
if current_content:
sections[current_section] = current_content
current_section = block.text
current_content = []
else:
current_content.append(block.text)
# Add the last section
if current_content:
sections[current_section] = current_content
return sections

View File

@@ -0,0 +1,80 @@
from typing import List, Dict, Any
from pdf_parser import PDFParser
from openai_client import OpenAIClient
class RequirementsChecker:
"""A class to check manuscript requirements using OpenAI's GPT model."""
def __init__(self, api_key: str = None):
"""
Initialize the requirements checker.
Args:
api_key (str, optional): OpenAI API key
"""
self.openai_client = OpenAIClient(api_key)
def check_manuscript(self, pdf_path: str, requirements: List[str]) -> Dict[str, Any]:
"""
Check if a manuscript meets the given requirements.
Args:
pdf_path (str): Path to the PDF manuscript
requirements (List[str]): List of requirements to check
Returns:
Dict[str, Any]: Analysis results
"""
# Parse PDF with structure preservation
pdf_parser = PDFParser(pdf_path)
manuscript_text = pdf_parser.extract_text()
# Get sections for better context
sections = pdf_parser.detect_sections()
# Calculate word count
word_count = len(manuscript_text.split())
# Add metadata and section information to the text
structured_text = f"""Document Metadata:
Word Count: {word_count} words
Document Structure:
"""
for section, content in sections.items():
section_text = ' '.join(content)
section_word_count = len(section_text.split())
structured_text += f"\n{section} ({section_word_count} words):\n{section_text}\n"
# Check requirements using OpenAI
analysis = self.openai_client.check_requirements(structured_text, requirements)
return analysis
def format_results(self, results: Dict[str, Any]) -> str:
"""
Format the analysis results into a readable string.
Args:
results (Dict[str, Any]): Analysis results from OpenAI
Returns:
str: Formatted results
"""
output = []
output.append("=== Manuscript Requirements Analysis ===\n")
# Format requirements analysis
for req_analysis in results["requirements_analysis"]:
output.append(f"Requirement: {req_analysis['requirement']}")
output.append(f"Status: {'✓ Met' if req_analysis['is_met'] else '✗ Not Met'}")
output.append(f"Evidence: {req_analysis['evidence']}")
output.append(f"Explanation: {req_analysis['explanation']}\n")
# Format desk rejection recommendation
rejection = results["desk_rejection_recommendation"]
output.append("=== Final Recommendation ===")
output.append(f"Desk Rejection: {'Yes' if rejection['should_reject'] else 'No'}")
output.append(f"Justification: {rejection['justification']}")
return "\n".join(output)

View File

@@ -0,0 +1,38 @@
import os
from dotenv import load_dotenv
from openai import OpenAI
def test_api_key():
# Force reload of environment variables
load_dotenv(override=True)
# Get API key
api_key = os.getenv("OPENAI_API_KEY")
print(f"API Key loaded: {'Yes' if api_key else 'No'}")
if api_key:
print(f"API Key starts with: {api_key[:7]}...")
print(f"API Key length: {len(api_key)}")
# Print current working directory and .env file location
print(f"\nCurrent working directory: {os.getcwd()}")
env_path = os.path.join(os.getcwd(), '.env')
print(f"Looking for .env file at: {env_path}")
print(f".env file exists: {os.path.exists(env_path)}")
try:
# Initialize OpenAI client
client = OpenAI(api_key=api_key)
# Make a simple API call
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello!"}],
max_tokens=5
)
print("\nAPI call successful!")
print(f"Response: {response.choices[0].message.content}")
except Exception as e:
print(f"\nError: {str(e)}")
if __name__ == "__main__":
test_api_key()

V3_Peer_Review/README.md Normal file
View File

@@ -0,0 +1,81 @@
# Academic Manuscript Peer Review Tool
This tool uses OpenAI's GPT-4 to perform automated peer reviews of academic manuscripts. It analyzes PDF manuscripts against a set of review criteria and provides detailed feedback, scores, and recommendations.
## Features
- Automated peer review of academic manuscripts
- Comprehensive analysis across multiple review criteria
- Detailed feedback with specific examples and suggestions
- Metadata extraction and document structure analysis
- Support for multiple PDF files
- Configurable review criteria
## Installation
1. Clone the repository
2. Install the required dependencies:
```bash
pip install -r requirements.txt
```
3. Create a `.env` file in the root directory with your OpenAI API key:
```
OPENAI_API_KEY=your_api_key_here
```
Note: The tool will look for the .env file in the current directory first, then in the parent directory.
## Usage
1. Place your PDF manuscripts in the `manuscripts` directory
2. (Optional) Customize the review criteria in `review_criteria.json`
3. Run the review tool:
```bash
python src/main.py --criteria review_criteria.json
```
### Command Line Arguments
- `--manuscripts-dir`: Directory containing PDF manuscripts (default: `manuscripts`)
- `--criteria`: Path to the review criteria JSON file (required)
- `--output-dir`: Directory to save review results (default: `analysis_results`)
- `--api-key`: OpenAI API key (optional if set in environment)
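The reviewer can also be driven from Python rather than the CLI. A minimal sketch, assuming it is run from the `src/` directory (the modules use flat imports); the manuscript path is illustrative:
```python
import json

from peer_review_checker import PeerReviewChecker

# Criteria file in the same flat JSON format as review_criteria.json
with open("../review_criteria.json") as f:
    criteria = json.load(f)

# With no api_key argument, the key is read from OPENAI_API_KEY (or a .env file)
checker = PeerReviewChecker()
results = checker.review_manuscript("../manuscripts/example.pdf", criteria)
print(checker.format_results(results))
```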
## Review Criteria
The tool evaluates manuscripts against the following criteria:
1. Originality and Innovation
2. Methodology
3. Results and Analysis
4. Writing and Presentation
5. Technical Accuracy
6. Literature Review
7. Figures and Tables
8. References
9. Ethical Considerations
10. Impact and Significance
Each criterion is scored on a scale of 1-5, with detailed feedback and specific examples provided.
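Custom criteria files use the same flat name-to-description JSON shape as `review_criteria.json`. A pared-down file might look like this (criteria chosen and descriptions shortened purely for illustration):
```
{
  "Methodology": "Evaluate the research design, methods, and procedures.",
  "Impact and Significance": "Evaluate the potential impact and significance of the research."
}
```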
## Output Format
The review results are saved in text files with the following sections (a trimmed example appears after this list):
- Manuscript Metadata
- Document Statistics
- Overall Assessment
- Detailed Assessment (per criterion)
- Score
- Feedback
- Examples
- Suggestions for Improvement
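A trimmed example of a review file, mirroring the structure produced by `format_results` (all values invented):
```
=== Manuscript Metadata ===
title: Example Manuscript Title
author: Jane Doe
page_count: 12

=== Document Statistics ===
total_references: 34
total_figures: 5

=== Overall Assessment ===
Score: 3/5
Summary: A solid study held back by presentation issues.
Recommendation: revise
Confidence: 80.0%

=== Detailed Assessment ===

Methodology
Score: 4/5
Feedback: The design is appropriate, but the sampling procedure needs more detail.
```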
## Requirements
- Python 3.7+
- OpenAI API key
- PDF manuscripts to review
## License
This project is licensed under the MIT License - see the LICENSE file for details.

View File

@@ -0,0 +1,3 @@
openai>=1.0.0
python-dotenv>=0.19.0
PyPDF2>=3.0.0

View File

@@ -0,0 +1,12 @@
{
"Originality and Innovation": "Assess the novelty and originality of the research. Consider if the work makes a significant contribution to the field and introduces new ideas or approaches.",
"Methodology": "Evaluate the research design, methods, and procedures. Consider if they are appropriate, well-described, and rigorously implemented.",
"Results and Analysis": "Assess the presentation and analysis of results. Consider if the data is properly analyzed, interpreted, and presented in a clear and logical manner.",
"Writing and Presentation": "Evaluate the clarity, organization, and quality of the writing. Consider if the manuscript is well-structured, easy to follow, and free of major language issues.",
"Technical Accuracy": "Assess the technical accuracy of the content, including mathematical derivations, statistical analyses, and experimental procedures.",
"Literature Review": "Evaluate the comprehensiveness and relevance of the literature review. Consider if it adequately covers the field and provides proper context for the research.",
"Figures and Tables": "Assess the quality and appropriateness of figures and tables. Consider if they are clear, well-labeled, and effectively support the text.",
"References": "Evaluate the completeness and accuracy of references. Consider if they are properly formatted and relevant to the research.",
"Ethical Considerations": "Assess if the research adheres to ethical standards and guidelines. Consider issues such as informed consent, data privacy, and conflict of interest.",
"Impact and Significance": "Evaluate the potential impact and significance of the research. Consider if it addresses an important question and has the potential to influence the field."
}

V3_Peer_Review/src/main.py Normal file
View File

@@ -0,0 +1,110 @@
import argparse
import json
import os
from typing import Dict, List
from peer_review_checker import PeerReviewChecker
def read_review_criteria(criteria_path: str) -> Dict[str, str]:
"""
Read review criteria from a JSON file.
Args:
criteria_path (str): Path to the criteria file
Returns:
Dict[str, str]: Dictionary of criteria and their descriptions
"""
with open(criteria_path, 'r') as f:
return json.load(f)
def get_pdf_files(directory: str) -> List[str]:
"""
Get all PDF files from a directory.
Args:
directory (str): Path to the directory
Returns:
List[str]: List of PDF file paths
"""
pdf_files = []
for file in os.listdir(directory):
if file.lower().endswith('.pdf'):
pdf_files.append(os.path.join(directory, file))
return pdf_files
def review_manuscript(checker: PeerReviewChecker, pdf_path: str, criteria: Dict[str, str], output_dir: str) -> None:
"""
Review a single manuscript and save results to a file.
Args:
checker (PeerReviewChecker): The peer review checker instance
pdf_path (str): Path to the PDF file
criteria (Dict[str, str]): Review criteria
output_dir (str): Directory to save the results
"""
try:
# Get the base filename without extension
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
# Review manuscript
results = checker.review_manuscript(pdf_path, criteria)
# Format results
formatted_results = checker.format_results(results)
# Save results to file
output_file = os.path.join(output_dir, f"{base_name}_review.txt")
with open(output_file, 'w') as f:
f.write(formatted_results)
print(f"Review completed for {base_name}")
print(f"Results saved to: {output_file}\n")
except Exception as e:
print(f"Error processing {pdf_path}: {str(e)}\n")
def main():
parser = argparse.ArgumentParser(description='Academic Manuscript Peer Review Tool')
parser.add_argument('--manuscripts-dir', default='manuscripts',
help='Directory containing PDF manuscripts (default: manuscripts)')
parser.add_argument('--criteria', required=True, help='Path to the review criteria JSON file')
parser.add_argument('--output-dir', default='analysis_results',
help='Directory to save review results (default: analysis_results)')
parser.add_argument('--api-key', help='OpenAI API key (optional if set in environment)')
args = parser.parse_args()
try:
# Create output directory if it doesn't exist
os.makedirs(args.output_dir, exist_ok=True)
# Read review criteria
criteria = read_review_criteria(args.criteria)
# Get PDF files
pdf_files = get_pdf_files(args.manuscripts_dir)
if not pdf_files:
print(f"No PDF files found in {args.manuscripts_dir}")
return 1
print(f"Found {len(pdf_files)} PDF files to review")
# Initialize checker
checker = PeerReviewChecker(api_key=args.api_key)
# Process each PDF
for pdf_path in pdf_files:
review_manuscript(checker, pdf_path, criteria, args.output_dir)
print("Review process complete!")
except Exception as e:
print(f"Error: {str(e)}")
return 1
return 0
if __name__ == '__main__':
exit(main())

View File

@@ -0,0 +1,124 @@
import os
import json
from typing import List, Dict, Any
from openai import OpenAI
from dotenv import load_dotenv
class OpenAIClient:
"""A class to handle interactions with the OpenAI API for peer review."""
def __init__(self, api_key: str = None):
"""
Initialize the OpenAI client.
Args:
api_key (str, optional): OpenAI API key. If not provided, will try to load from environment.
"""
# Try to load .env from the current directory
load_dotenv()
# If API key is not found, try to load from parent directory
if not os.getenv("OPENAI_API_KEY"):
# Get the path to the parent directory (two levels up from this file)
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
env_path = os.path.join(parent_dir, ".env")
if os.path.exists(env_path):
load_dotenv(env_path)
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
if not self.api_key:
raise ValueError("OpenAI API key is required")
self.client = OpenAI(api_key=self.api_key)
def analyze_manuscript(self, manuscript_text: str, review_criteria: Dict[str, str]) -> Dict[str, Any]:
"""
Analyze a manuscript using GPT-4 for comprehensive peer review.
Args:
manuscript_text (str): The full text of the manuscript
review_criteria (Dict[str, str]): Dictionary of review criteria and their descriptions
Returns:
Dict[str, Any]: Analysis results including scores and detailed feedback
"""
# Truncate manuscript text to first 4000 words to reduce token usage
words = manuscript_text.split()
truncated_text = ' '.join(words[:4000]) if len(words) > 4000 else manuscript_text
prompt = self._create_review_prompt(truncated_text, review_criteria)
try:
response = self.client.chat.completions.create(
model="gpt-4", # Using GPT-4 for more sophisticated analysis
messages=[
{"role": "system", "content": "You are an expert peer reviewer with extensive experience in academic publishing. Analyze manuscripts thoroughly and provide detailed, constructive feedback. Be objective and evidence-based in your assessment."},
{"role": "user", "content": prompt}
],
temperature=0.3,
max_tokens=2000 # Increased token limit for detailed feedback
)
return self._parse_response(response.choices[0].message.content)
except Exception as e:
raise Exception(f"Failed to analyze manuscript: {str(e)}")
def _create_review_prompt(self, manuscript_text: str, review_criteria: Dict[str, str]) -> str:
"""
Create a prompt for the peer review analysis.
Args:
manuscript_text (str): The manuscript text
review_criteria (Dict[str, str]): Review criteria and descriptions
Returns:
str: Formatted prompt for the OpenAI API
"""
criteria_section = "\n".join([f"- {criterion}: {description}"
for criterion, description in review_criteria.items()])
return f"""Please analyze the following manuscript according to these criteria:
{criteria_section}
For each criterion:
1. Provide a score from 1-5 (1 being lowest, 5 being highest)
2. Give detailed, constructive feedback
3. Support your assessment with specific examples from the text
4. Suggest specific improvements where applicable
Manuscript text:
{manuscript_text}
Please format your response as a JSON object with the following structure:
{{
"overall_assessment": {{
"score": <1-5>,
"summary": "<brief summary of overall assessment>"
}},
"criteria_assessments": {{
"<criterion_name>": {{
"score": <1-5>,
"feedback": "<detailed feedback>",
"examples": ["<specific example 1>", "<specific example 2>"],
"suggestions": ["<improvement suggestion 1>", "<improvement suggestion 2>"]
}}
}},
"recommendation": "<accept/revise/reject>",
"confidence": <0-1>
}}"""
def _parse_response(self, response: str) -> Dict[str, Any]:
"""
Parse the OpenAI API response into a structured format.
Args:
response (str): Raw response from the API
Returns:
Dict[str, Any]: Parsed response
"""
try:
return json.loads(response)
except json.JSONDecodeError:
raise Exception("Failed to parse OpenAI response as JSON")

View File

@@ -0,0 +1,117 @@
import os
import re
from typing import Dict, List, Tuple
import PyPDF2
class PDFParser:
"""A class to parse PDF manuscripts and extract structured content."""
def __init__(self, pdf_path: str):
"""
Initialize the PDF parser.
Args:
pdf_path (str): Path to the PDF file
"""
if not os.path.exists(pdf_path):
raise FileNotFoundError(f"PDF file not found: {pdf_path}")
self.pdf_path = pdf_path
def extract_text(self) -> str:
"""
Extract text from the PDF file.
Returns:
str: Extracted text
"""
try:
with open(self.pdf_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
return text.strip()
except Exception as e:
raise Exception(f"Failed to extract text from PDF: {str(e)}")
def detect_sections(self) -> Dict[str, List[str]]:
"""
Detect and extract sections from the manuscript.
Returns:
Dict[str, List[str]]: Dictionary of section names and their content
"""
text = self.extract_text()
# Common section headers in academic papers
section_patterns = {
'Abstract': r'Abstract[\s\S]*?(?=\n\n|\n[A-Z][a-z]+:)',
'Introduction': r'Introduction[\s\S]*?(?=\n\n|\n[A-Z][a-z]+:)',
'Methods': r'(Methods|Methodology|Materials and Methods)[\s\S]*?(?=\n\n|\n[A-Z][a-z]+:)',
'Results': r'Results[\s\S]*?(?=\n\n|\n[A-Z][a-z]+:)',
'Discussion': r'Discussion[\s\S]*?(?=\n\n|\n[A-Z][a-z]+:)',
'Conclusion': r'(Conclusion|Conclusions)[\s\S]*?(?=\n\n|\n[A-Z][a-z]+:)',
'References': r'(References|Bibliography)[\s\S]*?(?=\n\n|\n[A-Z][a-z]+:|$)'
}
sections = {}
for section_name, pattern in section_patterns.items():
matches = re.finditer(pattern, text, re.IGNORECASE)
for match in matches:
section_text = match.group(0).strip()
# Clean up the section text
section_text = re.sub(r'^\w+\s*', '', section_text) # Remove section header
sections[section_name] = section_text.split('\n')
return sections
def get_metadata(self) -> Dict[str, str]:
"""
Extract metadata from the PDF.
Returns:
Dict[str, str]: Dictionary of metadata
"""
try:
with open(self.pdf_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
metadata = reader.metadata or {}  # metadata can be None for PDFs without an info dictionary
return {
'title': metadata.get('/Title', 'Unknown'),
'author': metadata.get('/Author', 'Unknown'),
'creation_date': metadata.get('/CreationDate', 'Unknown'),
'page_count': str(len(reader.pages))
}
except Exception as e:
raise Exception(f"Failed to extract metadata from PDF: {str(e)}")
def get_references(self) -> List[str]:
"""
Extract references from the manuscript.
Returns:
List[str]: List of references
"""
sections = self.detect_sections()
if 'References' in sections:
return sections['References']
return []
def get_figures_and_tables(self) -> Tuple[List[str], List[str]]:
"""
Extract figures and tables from the manuscript.
Returns:
Tuple[List[str], List[str]]: Lists of figures and tables
"""
text = self.extract_text()
# Simple pattern matching for figures and tables
figure_pattern = r'Figure \d+[.:].*?(?=\n\n|\n[A-Z][a-z]+:)'
table_pattern = r'Table \d+[.:].*?(?=\n\n|\n[A-Z][a-z]+:)'
figures = re.findall(figure_pattern, text, re.IGNORECASE | re.DOTALL)
tables = re.findall(table_pattern, text, re.IGNORECASE | re.DOTALL)
return figures, tables

View File

@@ -0,0 +1,121 @@
from typing import Dict, Any, List
from pdf_parser import PDFParser
from openai_client import OpenAIClient
class PeerReviewChecker:
"""A class to coordinate the peer review process."""
def __init__(self, api_key: str = None):
"""
Initialize the peer review checker.
Args:
api_key (str, optional): OpenAI API key
"""
self.openai_client = OpenAIClient(api_key)
def review_manuscript(self, pdf_path: str, review_criteria: Dict[str, str]) -> Dict[str, Any]:
"""
Review a manuscript using the specified criteria.
Args:
pdf_path (str): Path to the PDF manuscript
review_criteria (Dict[str, str]): Dictionary of review criteria and their descriptions
Returns:
Dict[str, Any]: Review results
"""
# Parse PDF
pdf_parser = PDFParser(pdf_path)
# Get manuscript metadata
metadata = pdf_parser.get_metadata()
# Extract text and structure
manuscript_text = pdf_parser.extract_text()
sections = pdf_parser.detect_sections()
# Get references and figures/tables
references = pdf_parser.get_references()
figures, tables = pdf_parser.get_figures_and_tables()
# Add metadata and structure information to the text
structured_text = f"""Document Metadata:
Title: {metadata['title']}
Author: {metadata['author']}
Pages: {metadata['page_count']}
Creation Date: {metadata['creation_date']}
Document Structure:
"""
for section, content in sections.items():
section_text = ' '.join(content)
section_word_count = len(section_text.split())
structured_text += f"\n{section} ({section_word_count} words):\n{section_text}\n"
# Add references and figures/tables information
structured_text += f"\nReferences ({len(references)}):\n" + "\n".join(references)
structured_text += f"\n\nFigures ({len(figures)}):\n" + "\n".join(figures)
structured_text += f"\n\nTables ({len(tables)}):\n" + "\n".join(tables)
# Analyze manuscript using OpenAI
analysis = self.openai_client.analyze_manuscript(structured_text, review_criteria)
# Add metadata to the analysis results
analysis['metadata'] = metadata
analysis['statistics'] = {
'total_references': len(references),
'total_figures': len(figures),
'total_tables': len(tables),
'total_sections': len(sections)
}
return analysis
def format_results(self, results: Dict[str, Any]) -> str:
"""
Format the review results into a readable string.
Args:
results (Dict[str, Any]): Review results
Returns:
str: Formatted results
"""
output = []
# Add metadata section
output.append("=== Manuscript Metadata ===")
for key, value in results['metadata'].items():
output.append(f"{key}: {value}")
# Add statistics section
output.append("\n=== Document Statistics ===")
for key, value in results['statistics'].items():
output.append(f"{key}: {value}")
# Add overall assessment
output.append("\n=== Overall Assessment ===")
output.append(f"Score: {results['overall_assessment']['score']}/5")
output.append(f"Summary: {results['overall_assessment']['summary']}")
output.append(f"Recommendation: {results['recommendation']}")
output.append(f"Confidence: {results['confidence']*100:.1f}%")
# Add criteria assessments
output.append("\n=== Detailed Assessment ===")
for criterion, assessment in results['criteria_assessments'].items():
output.append(f"\n{criterion}")
output.append(f"Score: {assessment['score']}/5")
output.append(f"Feedback: {assessment['feedback']}")
if assessment['examples']:
output.append("\nExamples:")
for example in assessment['examples']:
output.append(f"- {example}")
if assessment['suggestions']:
output.append("\nSuggestions for Improvement:")
for suggestion in assessment['suggestions']:
output.append(f"- {suggestion}")
return "\n".join(output)