🤖 ML Model Training Guide for AI Research Oracle
Quick Start: Your First Prediction Model in Five Days
Day 1: Collect Historical Data
Step 1: Download Papers from 2020-2021
```python
import time
import requests
import pandas as pd
from datetime import datetime, timedelta

def collect_historical_papers(start_date, end_date):
    papers = []
    current = start_date
    while current <= end_date:
        # ArXiv API call; submittedDate expects YYYYMMDDHHMM timestamps
        window_end = current + timedelta(days=7)
        query = (
            "cat:cs.AI AND submittedDate:"
            f"[{current.strftime('%Y%m%d%H%M')} TO {window_end.strftime('%Y%m%d%H%M')}]"
        )
        response = requests.get(
            "http://export.arxiv.org/api/query",
            params={
                "search_query": query,
                "max_results": 1000,
                "sortBy": "submittedDate"
            }
        )
        # Parse the Atom feed and store the results
        papers.extend(parse_arxiv_response(response.text))
        current = window_end
        time.sleep(3)  # Respect the arXiv API rate limit
    return pd.DataFrame(papers)

# Collect 2 years of data
papers_df = collect_historical_papers(
    datetime(2020, 1, 1),
    datetime(2021, 12, 31)
)
```
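The collector above leans on a `parse_arxiv_response` helper that the guide never defines. A minimal sketch using the `feedparser` package (an assumption; any Atom parser works) could look like this:

```python
import feedparser
from datetime import datetime

def parse_arxiv_response(xml_text):
    """Turn an arXiv Atom feed into a list of paper dicts (minimal sketch)."""
    papers = []
    for entry in feedparser.parse(xml_text).entries:
        papers.append({
            # entry.id looks like "http://arxiv.org/abs/2001.01234v1"
            'arxiv_id': entry.id.split('/abs/')[-1].split('v')[0],
            'title': entry.title,
            'abstract': entry.summary,
            'authors': [{'name': a.name} for a in entry.authors],
            'published_date': datetime(*entry.published_parsed[:6]),
        })
    return papers
```

Note that arXiv metadata rarely includes affiliations, which is why the author-feature code later defaults them to an empty string.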
Step 2: Get Current Citations
```python
def get_current_citations(arxiv_id):
    # Use the Semantic Scholar Graph API
    response = requests.get(
        f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}",
        params={"fields": "citationCount,influentialCitationCount"}
    )
    if response.status_code == 200:
        data = response.json()
        return data.get('citationCount', 0)
    return 0

# Add citations to the dataframe
papers_df['citations_current'] = papers_df['arxiv_id'].apply(get_current_citations)

# Calculate citation rates relative to paper age
papers_df['years_old'] = (datetime.now() - papers_df['published_date']).dt.days / 365
papers_df['citations_per_year'] = papers_df['citations_current'] / papers_df['years_old']
```
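Semantic Scholar throttles unauthenticated clients, so calling the endpoint once per row for thousands of papers will eventually return HTTP 429. A hedged sketch of a back-off wrapper (the pause lengths are assumptions, not documented limits):

```python
import time
import requests

def get_current_citations_safe(arxiv_id, retries=3, pause=1.0):
    """Rate-limit-friendly wrapper around the citation lookup (sketch)."""
    for attempt in range(retries):
        response = requests.get(
            f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}",
            params={"fields": "citationCount"}
        )
        if response.status_code == 200:
            return response.json().get('citationCount', 0)
        if response.status_code == 429:
            # Throttled: back off and retry
            time.sleep(pause * (attempt + 1))
            continue
        break
    return 0
```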
Day 2: Extract Early-Signal Features
Step 3: Author Features
```python
def extract_author_features(authors_list):
    features = {
        'author_count': len(authors_list),
        'max_h_index': 0,
        'has_top_institution': False,
        'has_industry_author': False
    }
    top_unis = ['MIT', 'Stanford', 'CMU', 'Berkeley', 'Oxford', 'Cambridge']
    companies = ['Google', 'DeepMind', 'OpenAI', 'Meta', 'Microsoft']
    for author in authors_list:
        # Get h-index from Semantic Scholar
        h_index = get_author_h_index(author['name'])
        features['max_h_index'] = max(features['max_h_index'], h_index)
        # Check affiliations (may be missing in arXiv metadata)
        affiliation = author.get('affiliation', '')
        if any(uni in affiliation for uni in top_unis):
            features['has_top_institution'] = True
        if any(company in affiliation for company in companies):
            features['has_industry_author'] = True
    return features
```
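`get_author_h_index` is left undefined above. One option is the Semantic Scholar author-search endpoint; the sketch below naively takes the first match, which will be wrong for common names (an acknowledged simplification):

```python
import requests

def get_author_h_index(author_name):
    """Approximate an author's h-index via Semantic Scholar author search (sketch)."""
    response = requests.get(
        "https://api.semanticscholar.org/graph/v1/author/search",
        params={"query": author_name, "fields": "name,hIndex"}
    )
    if response.status_code != 200:
        return 0
    matches = response.json().get('data', [])
    if not matches:
        return 0
    # Take the first match; proper name disambiguation is out of scope here
    return matches[0].get('hIndex') or 0
```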
Step 4: Content Features
```python
def extract_content_features(paper):
    abstract = paper['abstract'].lower()
    title = paper['title'].lower()
    return {
        'abstract_length': len(abstract.split()),
        'title_length': len(title.split()),
        'has_code': 'github.com' in abstract or 'code available' in abstract,
        'claims_sota': any(term in abstract for term in
                           ['state-of-the-art', 'sota', 'outperform', 'beat']),
        'claims_novel': any(term in abstract for term in
                            ['novel', 'first', 'new approach', 'propose']),
        'mentions_dataset': 'dataset' in abstract or 'benchmark' in abstract,
        'mentions_realworld': 'real-world' in abstract or 'practical' in abstract
    }
```
Step 5: Simulate Early Social Signals
```python
import numpy as np

def estimate_early_signals(paper):
    """
    Since we can't get historical Twitter data easily,
    we'll estimate based on paper characteristics.
    """
    signals = {
        'estimated_tweets_7d': 0,
        'estimated_github_7d': 0,
        'estimated_reddit_7d': 0
    }
    # Papers with code get more GitHub attention
    if paper['has_code']:
        signals['estimated_github_7d'] = np.random.poisson(3)
    # SOTA claims get Twitter attention
    if paper['claims_sota']:
        signals['estimated_tweets_7d'] = np.random.poisson(10)
    # Industry authors get more social buzz
    if paper['has_industry_author']:
        signals['estimated_tweets_7d'] += np.random.poisson(15)
        signals['estimated_reddit_7d'] = np.random.poisson(5)
    # Top institutions get baseline attention
    if paper['has_top_institution']:
        signals['estimated_tweets_7d'] += np.random.poisson(5)
    return signals
```
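Step 6 assumes these feature columns already sit in `papers_df`, but the guide never shows wiring the extractors together. A minimal sketch that applies them row by row (it assumes each row carries the `authors`, `title`, and `abstract` fields from Day 1):

```python
import pandas as pd

def build_feature_row(row):
    """Combine author, content, and estimated social features for one paper (sketch)."""
    features = {}
    features.update(extract_author_features(row['authors']))
    features.update(extract_content_features(row))
    # The social-signal estimator reads flags produced by the other two extractors
    features.update(estimate_early_signals(features))
    return pd.Series(features)

feature_df = papers_df.apply(build_feature_row, axis=1)
papers_df = pd.concat([papers_df, feature_df], axis=1)
```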
Day 3: Train Your Model
Step 6: Prepare Training Data
```python
# Combine all features
feature_columns = [
    'author_count', 'max_h_index', 'has_top_institution', 'has_industry_author',
    'abstract_length', 'title_length', 'has_code', 'claims_sota',
    'claims_novel', 'mentions_dataset', 'mentions_realworld',
    'estimated_tweets_7d', 'estimated_github_7d', 'estimated_reddit_7d'
]
X = papers_df[feature_columns].copy()  # copy to avoid chained-assignment warnings
y = papers_df['citations_current']

# Convert booleans to int
bool_columns = X.select_dtypes(include=['bool']).columns
X[bool_columns] = X[bool_columns].astype(int)

# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
```
Step 7: Train Multiple Models
```python
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

models = {
    'linear': LinearRegression(),
    'ridge': Ridge(alpha=1.0),
    'rf': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    'gbm': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

results = {}
for name, model in models.items():
    # Train
    model.fit(X_train, y_train)
    # Predict
    y_pred = model.predict(X_test)
    # Evaluate
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {
        'mae': mae,
        'r2': r2,
        'model': model
    }
    print(f"{name}: MAE={mae:.2f}, R²={r2:.3f}")

# Save best model
best_model_name = max(results, key=lambda x: results[x]['r2'])
best_model = results[best_model_name]['model']
joblib.dump(best_model, 'oracle_model_v1.pkl')
```
Step 8: Feature Importance Analysis
```python
# Tree-based models (rf, gbm) expose feature importances
if hasattr(best_model, 'feature_importances_'):
    importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(importance.head(10))

    # Visualize
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 6))
    plt.barh(importance.head(10)['feature'], importance.head(10)['importance'])
    plt.xlabel('Importance')
    plt.title('Top 10 Feature Importances for Citation Prediction')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
```
Day 4: Create Prediction Pipeline
Step 9: Real-time Feature Extraction
```python
def extract_features_for_new_paper(arxiv_id):
    """Extract all features for a fresh paper."""
    # Get paper details
    paper = fetch_arxiv_paper(arxiv_id)
    # Author features
    author_features = extract_author_features(paper['authors'])
    # Content features
    content_features = extract_content_features(paper)
    # Real social signals (not estimates)
    social_features = {
        'twitter_mentions_7d': count_twitter_mentions(arxiv_id, days=7),
        'github_repos_7d': count_github_repos(arxiv_id, days=7),
        'reddit_score_7d': get_reddit_score(arxiv_id, days=7)
    }
    # Combine all features
    features = {**author_features, **content_features, **social_features}
    # Map the real signals onto the column names the model was trained on
    features['estimated_tweets_7d'] = features['twitter_mentions_7d']
    features['estimated_github_7d'] = features['github_repos_7d']
    features['estimated_reddit_7d'] = features['reddit_score_7d']
    return features
```
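The social-signal helpers (`count_twitter_mentions`, `count_github_repos`, `get_reddit_score`) are left to the reader. As one example, here is a hedged sketch of `count_github_repos` using GitHub's public repository-search API; the query string is a guess at how implementations reference an arXiv ID, and unauthenticated requests are heavily rate-limited:

```python
import requests
from datetime import datetime, timedelta

def count_github_repos(arxiv_id, days=7):
    """Count recently created repositories that mention the arXiv ID (sketch)."""
    since = (datetime.utcnow() - timedelta(days=days)).strftime('%Y-%m-%d')
    response = requests.get(
        "https://api.github.com/search/repositories",
        params={"q": f'"{arxiv_id}" created:>={since}'},
        headers={"Accept": "application/vnd.github+json"}
    )
    if response.status_code != 200:
        return 0
    return response.json().get('total_count', 0)
```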
Step 10: Make Predictions with Confidence
```python
def predict_impact(arxiv_id, model_path='oracle_model_v1.pkl'):
    # Load model
    model = joblib.load(model_path)
    # Get features
    features = extract_features_for_new_paper(arxiv_id)
    # Prepare for prediction
    X = pd.DataFrame([features])[feature_columns]
    X[bool_columns] = X[bool_columns].astype(int)
    # Predict
    prediction = model.predict(X)[0]
    # Calculate confidence (based on feature strength)
    confidence = calculate_confidence(features, prediction)
    # Estimate percentile
    percentile = estimate_percentile(prediction)
    return {
        'arxiv_id': arxiv_id,
        'predicted_citations_3yr': int(prediction),
        'predicted_citations_1yr': int(prediction * 0.3),  # rough estimate
        'predicted_citations_5yr': int(prediction * 1.8),  # rough estimate
        'confidence': confidence,
        'percentile': percentile,
        'features': features
    }
```
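`estimate_percentile` is never defined. A simple option is to place the prediction within the training set's citation distribution; the sketch below assumes `y_train` from Step 6 is still in scope (or persisted alongside the model):

```python
import numpy as np

def estimate_percentile(prediction, reference=None):
    """Percentile of the predicted count within the training citation distribution (sketch)."""
    reference = y_train if reference is None else reference
    return float((np.asarray(reference) < prediction).mean() * 100)
```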
Day 5: Deploy as API
Step 11: Simple Flask API
```python
from flask import Flask, request, jsonify
import joblib
import pandas as pd

app = Flask(__name__)

# Load model at startup (feature_columns and bool_columns must also be
# defined or imported here, matching the training script)
model = joblib.load('oracle_model_v1.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Get paper ID
        arxiv_id = request.json['arxiv_id']
        # Extract features
        features = extract_features_for_new_paper(arxiv_id)
        # Prepare for prediction
        X = pd.DataFrame([features])[feature_columns]
        X[bool_columns] = X[bool_columns].astype(int)
        # Predict
        prediction = model.predict(X)[0]
        # Calculate extras
        confidence = min(0.85, max(0.5, prediction / 100))  # simple confidence heuristic
        percentile = estimate_percentile(prediction)
        return jsonify({
            'success': True,
            'prediction': {
                'citations_1yr': int(prediction * 0.3),
                'citations_3yr': int(prediction),
                'citations_5yr': int(prediction * 1.8),
                'confidence': float(confidence),
                'percentile': int(percentile)
            },
            'features_used': features
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 400

if __name__ == '__main__':
    app.run(debug=True, port=5000)
```
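Once the server is running locally, a quick smoke test from another Python shell (the arXiv ID is just a placeholder):

```python
import requests

resp = requests.post(
    "http://localhost:5000/predict",
    json={"arxiv_id": "2101.00001"}  # placeholder arXiv ID
)
print(resp.json())
```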
Step 12: Deploy to Production
```text
# requirements.txt
flask==2.0.1
pandas==1.3.3
numpy==1.23.0
scikit-learn==0.24.2
joblib==1.0.1
requests==2.26.0
gunicorn==20.1.0
```

```text
# Procfile (for Heroku)
web: gunicorn app:app
```

```bash
# Deploy
git init
git add .
git commit -m "Oracle API v1"
heroku create ai-research-oracle-api
git push heroku main
```
Advanced Improvements
Better Features
```python
# Network features
'coauthor_network_size': count_unique_coauthors(author),
'author_recent_papers': count_papers_last_2_years(author),
'author_citation_velocity': author_citations_per_year,

# Topic features
'topic_papers_last_year': count_papers_in_topic(keywords),
'topic_growth_rate': calculate_topic_growth(keywords),
'keywords_trending_score': get_trending_score(keywords),

# Timing features
'published_day_of_week': paper.published_date.weekday(),
'published_near_conference': days_to_nearest_conference(date),
'competition_same_week': count_similar_papers_same_week(),
```
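These all assume helpers the guide doesn't show. As one example, here is a hedged sketch of `calculate_topic_growth` that reuses `papers_df` from Day 1 and treats keyword matches in abstracts as a crude proxy for topic membership:

```python
def calculate_topic_growth(keywords, papers=None):
    """Year-over-year growth in papers whose abstract mentions any keyword (sketch)."""
    papers = papers_df if papers is None else papers
    pattern = '|'.join(keywords)  # keywords treated as regex alternatives
    matches = papers[papers['abstract'].str.contains(pattern, case=False, na=False)]
    counts = matches.groupby(matches['published_date'].dt.year).size()
    if len(counts) < 2 or counts.iloc[-2] == 0:
        return 0.0
    return float(counts.iloc[-1] / counts.iloc[-2] - 1)
```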
Ensemble Approach
```python
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Train multiple models (build_neural_network is a user-supplied helper)
models = {
    'rf': RandomForestRegressor(),
    'xgb': XGBRegressor(),
    'nn': build_neural_network()
}
for model in models.values():
    model.fit(X_train, y_train)

# Weighted predictions
weights = {'rf': 0.4, 'xgb': 0.4, 'nn': 0.2}
final_prediction = sum(
    models[name].predict(X_test) * weight
    for name, weight in weights.items()
)
```
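`build_neural_network` is not specified anywhere; a minimal stand-in using scikit-learn's `MLPRegressor` (the layer sizes are assumptions, not from the guide):

```python
from sklearn.neural_network import MLPRegressor

def build_neural_network():
    """A small feed-forward regressor as a placeholder 'nn' ensemble member (sketch)."""
    return MLPRegressor(
        hidden_layer_sizes=(64, 32),  # assumed architecture
        max_iter=500,
        random_state=42
    )
```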
Confidence Calibration
```python
def calculate_confidence(features, prediction):
    # Base confidence on feature quality
    confidence = 0.5
    # Strong author signal
    if features['max_h_index'] > 30:
        confidence += 0.1
    # Strong social signal
    if features['twitter_mentions_7d'] > 20:
        confidence += 0.1
    # Has code implementation
    if features['has_code']:
        confidence += 0.05
    # Novel claims with evidence
    if features['claims_novel'] and features['has_code']:
        confidence += 0.1
    # Cap confidence
    return min(0.9, confidence)
```
Testing Your Oracle
Backtest on 2022 Papers
```python
# Get 2022 papers
test_papers = collect_historical_papers(
    datetime(2022, 1, 1),
    datetime(2022, 12, 31)
)

# Make predictions
predictions = []
for _, paper in test_papers.iterrows():
    pred = predict_impact(paper['arxiv_id'])
    predictions.append(pred)

# Wait for actual citations (for 2022 papers you can fetch current counts now)
# Calculate accuracy metrics
```
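The backtest stops at "calculate accuracy metrics". A sketch that compares predictions against citation counts fetched today (an approximation of the 3-year target), reporting MAE plus Spearman rank correlation, which is arguably the fairer test since the Oracle mostly needs to rank papers:

```python
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics import mean_absolute_error

predicted = np.array([p['predicted_citations_3yr'] for p in predictions])
actual = test_papers['arxiv_id'].apply(get_current_citations).to_numpy()

print(f"MAE: {mean_absolute_error(actual, predicted):.1f} citations")
rho, _ = spearmanr(actual, predicted)
print(f"Spearman rank correlation: {rho:.3f}")
```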
A/B Test Content
```text
# Version A: Just prediction
"This paper will get 156 citations"

# Version B: With confidence
"This paper will get 156 citations (78% confidence)"

# Version C: With explanation
"This paper will get 156 citations because:
- Lead author h-index: 45
- 25 tweets in first week
- 3 GitHub implementations"

# Track which gets more engagement
```
Launch Checklist
Technical
- [ ] Model trained on 10,000+ papers
- [ ] API deployed and tested
- [ ] Prediction accuracy >60%
- [ ] Feature extraction automated
- [ ] Error handling robust
Content
- [ ] 20 predictions ready
- [ ] Methodology documented
- [ ] Accuracy disclaimer clear
- [ ] Success stories prepared
- [ ] FAQs answered
Business
- [ ] Pricing strategy set
- [ ] Legal review done
- [ ] Support system ready
- [ ] Tracking analytics live
- [ ] Press kit prepared
Remember
- Start Simple: Linear regression beats no prediction
- Be Transparent: Share your accuracy openly
- Iterate Fast: New model version every month
- Focus on Value: Even 60% accuracy helps researchers
- Build Trust: Admit mistakes, celebrate wins
This guide gets you from zero to working Oracle in 5 days. The key is starting simple and improving based on real data. Your first model won't be perfect, but it will be infinitely better than waiting 3 years for citations! 🔮