help@rskworld.in +91 93305 39277
RSK World
  • Home
  • Development
    • Web Development
    • Mobile Apps
    • Software
    • Games
    • Project
  • Technologies
    • Data Science
    • AI Development
    • Cloud Development
    • Blockchain
    • Cyber Security
    • Dev Tools
    • Testing Tools
  • About
  • Contact

Theme Settings

Color Scheme
Display Options
Font Size
100%
Back to Project
RSK World
sentiment-analysis
/
scripts
RSK World
sentiment-analysis
Sentiment Analysis Dataset - NLP + Text Classification + Machine Learning
scripts
  • analyze_sentiment.py20.9 KB
  • generate_data.py20.8 KB
  • preprocess_data.py21.9 KB
  • requirements.txt2.2 KB
  • train_model.py16.2 KB
  • visualize_data.py24.7 KB
train_model.pytranscripts.jsonprocess_data.pyvisualize_data.pypreprocess_data.py
scripts/train_model.py
Raw Download
Find: Go to:
#!/usr/bin/env python3
"""
================================================================================
 * Sentiment Analysis Dataset - Model Training Script
 * 
 * Project: Sentiment Analysis Dataset
 * Description: Train machine learning models for sentiment classification
 *              using various algorithms including Naive Bayes, SVM, and more.
 * Category: Text Data
 * Difficulty: Intermediate
 * 
 * Author: Molla Samser (Founder)
 * Designer & Tester: Rima Khatun
 * Website: https://rskworld.in
 * Email: help@rskworld.in | support@rskworld.in
 * Phone: +91 93305 39277
 * 
 * © 2026 RSK World - Free Programming Resources & Source Code
 * All rights reserved.
================================================================================

Usage:
    python train_model.py --train ../data/train_data.csv --test ../data/test_data.csv
    python train_model.py --input ../data/sentiment_data.csv --model svm --split 0.8
    python train_model.py --input ../data/sentiment_data.csv --all-models --save
"""

import argparse
import csv
import json
import os
import pickle
import re
import string
from typing import List, Dict, Tuple, Optional
from collections import Counter
from datetime import datetime

# Try to import ML libraries
try:
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
    from sklearn.pipeline import Pipeline
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False

try:
    import numpy as np
    NUMPY_AVAILABLE = True
except ImportError:
    NUMPY_AVAILABLE = False


# ============================================
# Data Loading and Preprocessing
# ============================================

def load_data(filepath: str) -> List[Dict]:
    """Load samples from a CSV or JSON file.

    CSV files: physical lines whose first non-blank character is ``#`` are
    treated as comments and dropped; the remaining lines are parsed with
    ``csv.DictReader`` (the first kept line is the header).

    JSON files: if the document is an object containing a ``data`` key, the
    value of that key is returned; otherwise the parsed document is returned
    unchanged.

    Files are decoded as ``utf-8-sig`` so that an optional UTF-8 byte-order
    mark (as written by some spreadsheet tools) does not end up glued to the
    first header name or break JSON parsing. Files without a BOM decode
    exactly as plain UTF-8.

    Args:
        filepath: Path to a ``.csv`` or ``.json`` file (extension is
            matched case-insensitively).

    Returns:
        The loaded records.

    Raises:
        ValueError: If the file extension is neither ``.csv`` nor ``.json``.
    """
    ext = os.path.splitext(filepath)[1].lower()

    if ext == '.csv':
        # newline='' leaves line endings untouched so the csv module can
        # interpret them itself (required for embedded newlines in quotes).
        with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
            lines = [line for line in f if not line.lstrip().startswith('#')]
        return list(csv.DictReader(lines))

    if ext == '.json':
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            content = json.load(f)
        if isinstance(content, dict) and 'data' in content:
            return content['data']
        return content

    raise ValueError(f"Unsupported file format: {ext}")


def preprocess_text(text: str) -> str:
    """Normalize raw text for vectorization.

    Steps, in order: lowercase, strip URLs (``http(s)://...`` and
    ``www....``), delete every ASCII punctuation character, and collapse
    all whitespace runs into single spaces (also trimming both ends).
    """
    lowered = text.lower()
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', lowered)
    # A translation table with only a "delete" set removes punctuation in one pass.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_urls.translate(punctuation_table)
    tokens = without_punctuation.split()
    return ' '.join(tokens)


def prepare_data(data: List[Dict]) -> Tuple[List[str], List[str]]:
    """Split records into parallel lists of cleaned texts and labels.

    Each record's ``text`` (default ``''``) and ``sentiment`` (default
    ``'neutral'``) are read; records where either value is falsy are
    skipped. Kept texts are passed through ``preprocess_text``.

    Returns:
        ``(texts, labels)`` with one entry per kept record, in input order.
    """
    raw_pairs = (
        (record.get('text', ''), record.get('sentiment', 'neutral'))
        for record in data
    )
    kept = [
        (preprocess_text(raw_text), target)
        for raw_text, target in raw_pairs
        if raw_text and target
    ]
    return [cleaned for cleaned, _ in kept], [target for _, target in kept]


# ============================================
# Model Training
# ============================================

class SentimentModelTrainer:
    """Train, evaluate, and persist sentiment classification pipelines.

    Each trained model is a scikit-learn ``Pipeline`` made of a text
    vectorizer followed by a classifier. Trained pipelines are kept in
    ``self.models`` keyed by model name; ``evaluate`` records the most
    accurate one in ``self.best_model`` / ``self.best_model_name``.
    """

    def __init__(self, vectorizer_type: str = 'tfidf'):
        """Create a trainer.

        Args:
            vectorizer_type: ``'tfidf'`` selects ``TfidfVectorizer``; any
                other value selects ``CountVectorizer``.
        """
        self.vectorizer_type = vectorizer_type
        self.models = {}
        self.best_model = None
        self.best_model_name = None
        self.vectorizer = None

        # Always define the attribute so that methods fail with a clear
        # ImportError/ValueError instead of AttributeError without sklearn.
        self.available_models = {}
        if SKLEARN_AVAILABLE:
            # These instances act as unfitted templates; train() clones them.
            self.available_models = {
                'naive_bayes': MultinomialNB(),
                'svm': LinearSVC(max_iter=10000),
                'logistic_regression': LogisticRegression(max_iter=1000),
                'random_forest': RandomForestClassifier(n_estimators=100, n_jobs=-1)
            }

    def create_vectorizer(self) -> object:
        """Create a fresh, unfitted text vectorizer.

        Both vectorizer kinds share the same settings: at most 5000
        features, unigrams and bigrams, terms must occur in at least 2
        documents and in at most 95% of documents.
        """
        vectorizer_cls = TfidfVectorizer if self.vectorizer_type == 'tfidf' else CountVectorizer
        return vectorizer_cls(
            max_features=5000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95
        )

    def train(
        self,
        X_train: List[str],
        y_train: List[str],
        model_name: str = 'naive_bayes'
    ) -> object:
        """Train a single model and store it under ``model_name``.

        Args:
            X_train: Training texts.
            y_train: Training labels, parallel to ``X_train``.
            model_name: Key into ``self.available_models``.

        Returns:
            The fitted pipeline (also stored in ``self.models``).

        Raises:
            ImportError: If scikit-learn is not installed.
            ValueError: If ``model_name`` is not a known model.
        """
        if not SKLEARN_AVAILABLE:
            raise ImportError("scikit-learn is required for model training")

        if model_name not in self.available_models:
            raise ValueError(f"Unknown model: {model_name}")

        # Imported here because sklearn availability was just verified.
        from sklearn.base import clone

        # Pipeline.fit fits its steps in place, so work on a clone of the
        # template estimator; otherwise retraining the same model name would
        # silently refit the classifier inside previously returned pipelines.
        self.vectorizer = self.create_vectorizer()
        pipeline = Pipeline([
            ('vectorizer', self.vectorizer),
            ('classifier', clone(self.available_models[model_name]))
        ])

        pipeline.fit(X_train, y_train)
        self.models[model_name] = pipeline

        return pipeline

    def train_all(
        self,
        X_train: List[str],
        y_train: List[str]
    ) -> Dict[str, object]:
        """Train every available model.

        Returns:
            Mapping of model name to fitted pipeline.

        Raises:
            ImportError: If scikit-learn is not installed.
        """
        if not SKLEARN_AVAILABLE:
            raise ImportError("scikit-learn is required for model training")

        results = {}

        for model_name in self.available_models:
            print(f"  Training {model_name}...")
            results[model_name] = self.train(X_train, y_train, model_name)

        return results

    def evaluate(
        self,
        X_test: List[str],
        y_test: List[str],
        model_name: Optional[str] = None
    ) -> Dict:
        """Evaluate one trained model, or all of them, on test data.

        The most accurate of the evaluated models becomes
        ``self.best_model`` / ``self.best_model_name``.

        Args:
            X_test: Test texts.
            y_test: True labels, parallel to ``X_test``.
            model_name: Evaluate only this trained model; when omitted,
                every model in ``self.models`` is evaluated.

        Returns:
            Mapping of model name to a dict with ``accuracy``,
            ``classification_report`` (dict form) and ``predictions``.

        Raises:
            KeyError: If ``model_name`` is given but was never trained.
        """
        results = {}

        if model_name:
            models_to_evaluate = {model_name: self.models[model_name]}
        else:
            models_to_evaluate = self.models

        # Start below any achievable accuracy so that a model scoring 0.0
        # is still selected as "best" when nothing does better.
        best_accuracy = -1.0

        for name, pipeline in models_to_evaluate.items():
            y_pred = pipeline.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred, output_dict=True)

            results[name] = {
                'accuracy': accuracy,
                'classification_report': report,
                'predictions': y_pred.tolist() if hasattr(y_pred, 'tolist') else list(y_pred)
            }

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                self.best_model = pipeline
                self.best_model_name = name

        return results

    def cross_validate(
        self,
        X: List[str],
        y: List[str],
        model_name: str = 'naive_bayes',
        cv: int = 5
    ) -> Dict:
        """Run k-fold cross-validation (accuracy) for one model.

        Returns:
            Dict with ``model``, ``cv_folds``, per-fold ``scores``,
            ``mean_accuracy`` and ``std_accuracy``.

        Raises:
            ImportError: If scikit-learn is not installed.
            ValueError: If ``model_name`` is not a known model.
        """
        if not SKLEARN_AVAILABLE:
            raise ImportError("scikit-learn is required")

        # Same error as train() for an unknown name, instead of a bare KeyError.
        if model_name not in self.available_models:
            raise ValueError(f"Unknown model: {model_name}")

        pipeline = Pipeline([
            ('vectorizer', self.create_vectorizer()),
            ('classifier', self.available_models[model_name])
        ])

        scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

        return {
            'model': model_name,
            'cv_folds': cv,
            'scores': scores.tolist(),
            'mean_accuracy': scores.mean(),
            'std_accuracy': scores.std()
        }

    def predict(self, texts: List[str], model_name: Optional[str] = None) -> List[str]:
        """Predict labels for raw texts.

        Texts are run through ``preprocess_text`` before prediction.

        Args:
            texts: Raw input texts.
            model_name: Trained model to use; defaults to the best model
                recorded by ``evaluate``.

        Raises:
            ValueError: If the requested (or best) model does not exist.
        """
        pipeline = self.models.get(model_name) if model_name else self.best_model

        if pipeline is None:
            raise ValueError("No trained model available")

        processed_texts = [preprocess_text(t) for t in texts]

        return pipeline.predict(processed_texts).tolist()

    def save_model(self, filepath: str, model_name: Optional[str] = None):
        """Pickle a trained pipeline, its name and a timestamp to ``filepath``.

        Args:
            filepath: Destination file; missing parent directories are created.
            model_name: Trained model to save; defaults to the best model
                recorded by ``evaluate``.

        Raises:
            ValueError: If the requested (or best) model does not exist.
        """
        if model_name:
            pipeline = self.models.get(model_name)
        else:
            pipeline = self.best_model
            model_name = self.best_model_name

        if pipeline is None:
            raise ValueError("No model to save")

        # dirname is '' for a bare filename; fall back to the current directory.
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        with open(filepath, 'wb') as f:
            pickle.dump({
                'pipeline': pipeline,
                'model_name': model_name,
                'saved_at': datetime.now().isoformat()
            }, f)

        print(f"✓ Model saved to {filepath}")

    @staticmethod
    def load_model(filepath: str) -> Tuple[object, str]:
        """Load a pipeline previously written by ``save_model``.

        Returns:
            ``(pipeline, model_name)``.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load model files that come from a trusted source.
        with open(filepath, 'rb') as f:
            data = pickle.load(f)

        return data['pipeline'], data['model_name']


# ============================================
# Main Function
# ============================================

def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the training script."""
    parser = argparse.ArgumentParser(
        description="Train Sentiment Classification Models - RSK World",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python train_model.py --input ../data/sentiment_data.csv --model naive_bayes
  python train_model.py --train ../data/train_data.csv --test ../data/test_data.csv --all-models
  python train_model.py --input ../data/sentiment_data.csv --model svm --save --output ./models/

Author: Molla Samser (Founder) - RSK World
Website: https://rskworld.in
        """
    )

    parser.add_argument(
        "--input", "-i",
        type=str,
        help="Input data file (will be split into train/test)"
    )

    parser.add_argument(
        "--train",
        type=str,
        help="Training data file"
    )

    parser.add_argument(
        "--test",
        type=str,
        help="Test data file"
    )

    parser.add_argument(
        "--model", "-m",
        type=str,
        choices=['naive_bayes', 'svm', 'logistic_regression', 'random_forest'],
        default='naive_bayes',
        help="Model to train (default: naive_bayes)"
    )

    parser.add_argument(
        "--all-models", "-a",
        action="store_true",
        help="Train all available models"
    )

    parser.add_argument(
        "--split", "-s",
        type=float,
        default=0.8,
        help="Train/test split ratio (default: 0.8)"
    )

    parser.add_argument(
        "--vectorizer", "-v",
        type=str,
        choices=['tfidf', 'count'],
        default='tfidf',
        help="Vectorizer type (default: tfidf)"
    )

    parser.add_argument(
        "--cross-validate", "-cv",
        type=int,
        default=0,
        help="Number of cross-validation folds (0 = no CV)"
    )

    parser.add_argument(
        "--save",
        action="store_true",
        help="Save the best model"
    )

    parser.add_argument(
        "--output", "-o",
        type=str,
        default="./models",
        help="Output directory for saved models"
    )

    return parser


def _print_model_report(model_name: str, result: Dict) -> None:
    """Print accuracy and a per-class precision/recall/F1 table for one model."""
    print(f"\n{model_name.upper()}")
    print("-" * 40)
    print(f"Accuracy: {result['accuracy']:.4f} ({result['accuracy']:.2%})")
    print()

    report = result['classification_report']
    print(f"{'Class':<12} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}")
    print("-" * 48)

    # The usual three sentiments come first in a fixed order; any other label
    # found in the report follows, so datasets with different label names are
    # not silently omitted. Aggregate rows are not classes and are excluded.
    preferred = ['positive', 'neutral', 'negative']
    aggregate_rows = {'accuracy', 'macro avg', 'micro avg', 'weighted avg'}
    classes = [cls for cls in preferred if cls in report]
    classes += [cls for cls in report if cls not in preferred and cls not in aggregate_rows]

    for cls in classes:
        m = report[cls]
        print(f"{cls:<12} {m['precision']:<12.4f} {m['recall']:<12.4f} {m['f1-score']:<12.4f}")

    print("-" * 48)
    print(f"{'Macro Avg':<12} {report['macro avg']['precision']:<12.4f} "
          f"{report['macro avg']['recall']:<12.4f} {report['macro avg']['f1-score']:<12.4f}")


def main():
    """Command-line entry point: load data, train, evaluate, optionally save.

    Data comes either from ``--input`` (split into train/test using
    ``--split``, stratified by label) or from separate ``--train`` and
    ``--test`` files. Problems with the arguments or the data are reported
    on stdout and the function returns without training.
    """
    args = _build_arg_parser().parse_args()

    print("""
╔══════════════════════════════════════════════════════════════════╗
║         RSK World - Sentiment Model Trainer                      ║
║                                                                  ║
║  Author: Molla Samser (Founder)                                  ║
║  Website: https://rskworld.in                                    ║
║  © 2026 RSK World - Free Programming Resources & Source Code     ║
╚══════════════════════════════════════════════════════════════════╝
    """)

    # Check for scikit-learn
    if not SKLEARN_AVAILABLE:
        print("❌ Error: scikit-learn is required for model training.")
        print("   Install with: pip install scikit-learn")
        return

    # Load data
    if args.input:
        # A float train_size must lie strictly between 0 and 1; check it here
        # to give a readable message instead of a library traceback.
        if not 0.0 < args.split < 1.0:
            print("❌ Error: --split must be between 0 and 1 (exclusive)")
            return

        print(f"Loading data from {args.input}...")
        data = load_data(args.input)
        texts, labels = prepare_data(data)

        print(f"Total samples: {len(texts)}")
        if not texts:
            print("❌ Error: No usable samples found in input data")
            return

        print(f"Splitting data ({args.split:.0%} train, {1-args.split:.0%} test)...")

        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, train_size=args.split, random_state=42, stratify=labels
        )

    elif args.train and args.test:
        print(f"Loading training data from {args.train}...")
        train_data = load_data(args.train)
        X_train, y_train = prepare_data(train_data)

        print(f"Loading test data from {args.test}...")
        test_data = load_data(args.test)
        X_test, y_test = prepare_data(test_data)

        if not X_train or not X_test:
            print("❌ Error: No usable samples found in training or test data")
            return

    else:
        print("❌ Error: Provide either --input or both --train and --test")
        return

    print(f"\nTraining samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")
    print(f"Classes: {set(y_train)}")
    print()

    # Initialize trainer
    trainer = SentimentModelTrainer(vectorizer_type=args.vectorizer)

    # Cross-validation if requested
    if args.cross_validate > 0:
        print(f"Performing {args.cross_validate}-fold cross-validation...")
        cv_results = trainer.cross_validate(X_train, y_train, args.model, args.cross_validate)
        print(f"  Mean accuracy: {cv_results['mean_accuracy']:.4f} (+/- {cv_results['std_accuracy']:.4f})")
        print()

    # Train models
    if args.all_models:
        print("Training all models...")
        trainer.train_all(X_train, y_train)
    else:
        print(f"Training {args.model}...")
        trainer.train(X_train, y_train, args.model)

    print()

    # Evaluate
    print("Evaluating models...")
    print("=" * 60)

    results = trainer.evaluate(X_test, y_test)

    for model_name, result in results.items():
        _print_model_report(model_name, result)

    # The trainer may not have picked a best model (e.g. when every model
    # scored 0.0 under a strict ">" comparison); fall back to the highest
    # accuracy in the results so the summary and --save still work.
    best_name = trainer.best_model_name
    if best_name not in results:
        if not results:
            print("❌ Error: No models were evaluated")
            return
        best_name = max(results, key=lambda name: results[name]['accuracy'])

    # Print best model
    print()
    print("=" * 60)
    print(f"🏆 Best Model: {best_name}")
    print(f"   Accuracy: {results[best_name]['accuracy']:.2%}")

    # Save model if requested
    if args.save:
        os.makedirs(args.output, exist_ok=True)
        model_path = os.path.join(args.output, f'{best_name}_model.pkl')
        # Passing the name explicitly keeps the saved pipeline in sync with
        # the file name even when the fallback above chose the model.
        trainer.save_model(model_path, best_name)

    print()
    print("✓ Training complete!")
    print()

# Run the command-line interface only when this file is executed as a
# script; importing the module must not start a training run.
if __name__ == "__main__":
    main()

506 lines•16.2 KB
python
scripts/visualize_data.py
Raw Download
Find: Go to:
#!/usr/bin/env python3
"""
================================================================================
 * Sentiment Analysis Dataset - Data Visualization Script
 * 
 * Project: Sentiment Analysis Dataset
 * Description: Generate visualizations and statistics for sentiment analysis
 *              datasets including distribution charts, word clouds, and more.
 * Category: Text Data
 * Difficulty: Intermediate
 * 
 * Author: Molla Samser (Founder)
 * Designer & Tester: Rima Khatun
 * Website: https://rskworld.in
 * Email: help@rskworld.in | support@rskworld.in
 * Phone: +91 93305 39277
 * 
 * © 2026 RSK World - Free Programming Resources & Source Code
 * All rights reserved.
================================================================================

Usage:
    python visualize_data.py --input ./data/sentiment_data.csv
    python visualize_data.py --input ./data/sentiment_data.json --output ./charts/
    python visualize_data.py --input ./data/ --all-charts --interactive
"""

import argparse
import csv
import json
import os
import re
from typing import List, Dict, Optional
from collections import Counter
from datetime import datetime

# Try to import visualization libraries
try:
    import matplotlib
    matplotlib.use('Agg')  # Non-interactive backend
    import matplotlib.pyplot as plt
    MATPLOTLIB_AVAILABLE = True
except ImportError:
    MATPLOTLIB_AVAILABLE = False

try:
    from wordcloud import WordCloud
    WORDCLOUD_AVAILABLE = True
except ImportError:
    WORDCLOUD_AVAILABLE = False

try:
    import numpy as np
    NUMPY_AVAILABLE = True
except ImportError:
    NUMPY_AVAILABLE = False


# ============================================
# Data Loading
# ============================================

def load_data(filepath: str) -> List[Dict]:
    """Load samples from a CSV or JSON file.

    CSV files: physical lines whose first non-blank character is ``#`` are
    treated as comments and dropped; the remaining lines are parsed with
    ``csv.DictReader`` (the first kept line is the header).

    JSON files: if the document is an object containing a ``data`` key, the
    value of that key is returned; otherwise the parsed document is returned
    unchanged.

    Files are decoded as ``utf-8-sig`` so that an optional UTF-8 byte-order
    mark does not end up glued to the first header name or break JSON
    parsing. Files without a BOM decode exactly as plain UTF-8.

    Args:
        filepath: Path to a ``.csv`` or ``.json`` file (extension is
            matched case-insensitively).

    Returns:
        The loaded records.

    Raises:
        ValueError: If the file extension is neither ``.csv`` nor ``.json``.
    """
    suffix = os.path.splitext(filepath)[1].lower()

    if suffix == '.csv':
        # newline='' leaves line endings untouched so the csv module can
        # interpret them itself (required for embedded newlines in quotes).
        with open(filepath, 'r', encoding='utf-8-sig', newline='') as handle:
            kept_lines = [ln for ln in handle if not ln.lstrip().startswith('#')]
        return list(csv.DictReader(kept_lines))

    if suffix == '.json':
        with open(filepath, 'r', encoding='utf-8-sig') as handle:
            document = json.load(handle)
        if isinstance(document, dict) and 'data' in document:
            return document['data']
        return document

    raise ValueError(f"Unsupported file format: {suffix}")


# ============================================
# Statistics Functions
# ============================================

def calculate_statistics(data: List[Dict]) -> Dict:
    """Calculate summary statistics for the dataset.

    A field whose value is ``None`` (for example a CSV row with fewer
    columns than the header, or a JSON ``null``) is treated like a missing
    field: ``'unknown'`` for ``sentiment``/``source`` and ``''`` for ``text``.

    Args:
        data: Records with optional ``text``, ``sentiment``, ``source`` and
            ``date`` keys.

    Returns:
        Dict with ``total_samples``, ``sentiment_distribution``,
        ``source_distribution``, ``text_statistics`` (average/min/max
        character length and average word count) and ``date_range``
        (smallest and largest non-empty ``date`` value, or ``None``).
    """
    def field(sample: Dict, key: str, default: str):
        # dict.get only falls back when the key is absent; also fall back
        # when the key is present but holds None.
        value = sample.get(key, default)
        return default if value is None else value

    total = len(data)

    # Sentiment and source distributions
    sentiment_counts = Counter(field(d, 'sentiment', 'unknown') for d in data)
    source_counts = Counter(field(d, 'source', 'unknown') for d in data)

    # Text length statistics
    texts = [field(d, 'text', '') for d in data]
    text_lengths = [len(t) for t in texts]
    word_counts = [len(t.split()) for t in texts]

    avg_text_length = sum(text_lengths) / total if total > 0 else 0
    avg_word_count = sum(word_counts) / total if total > 0 else 0

    # Date range: values are compared as-is (min/max of the raw values).
    dates = [d['date'] for d in data if d.get('date')]

    return {
        "total_samples": total,
        "sentiment_distribution": dict(sentiment_counts),
        "source_distribution": dict(source_counts),
        "text_statistics": {
            "avg_length": round(avg_text_length, 2),
            "avg_word_count": round(avg_word_count, 2),
            "min_length": min(text_lengths, default=0),
            "max_length": max(text_lengths, default=0)
        },
        "date_range": {
            "earliest": min(dates) if dates else None,
            "latest": max(dates) if dates else None
        }
    }


def get_top_words(data: List[Dict], sentiment: Optional[str] = None, top_n: int = 50) -> List[tuple]:
    """Get the most frequent words, optionally restricted to one sentiment.

    Text is lowercased, every character that is neither a word character
    nor whitespace is replaced by a space, and the result is split on
    whitespace. Words of one or two characters and a small built-in list
    of English stopwords are ignored. Records whose ``text`` is missing or
    ``None`` contribute no words.

    Args:
        data: Records with ``text`` and ``sentiment`` keys.
        sentiment: If given (and non-empty), only records whose
            ``sentiment`` equals this value are counted.
        top_n: Maximum number of entries to return.

    Returns:
        ``(word, count)`` tuples, most frequent first.
    """
    stopwords = {
        'the', 'a', 'an', 'is', 'it', 'to', 'and', 'of', 'in', 'for', 'on',
        'with', 'as', 'was', 'that', 'this', 'i', 'my', 'but', 'have', 'has',
        'be', 'are', 'been', 'will', 'would', 'could', 'should', 'from', 'at',
        'or', 'by', 'so', 'if', 'just', 'what', 'all', 'were', 'we', 'they'
    }

    # Count while streaming instead of first materializing every word.
    word_counts = Counter()

    for sample in data:
        if sentiment and sample.get('sentiment') != sentiment:
            continue

        # "or ''" also covers a present-but-None text value.
        text = (sample.get('text') or '').lower()
        tokens = re.sub(r'[^\w\s]', ' ', text).split()
        word_counts.update(w for w in tokens if len(w) > 2 and w not in stopwords)

    return word_counts.most_common(top_n)


# ============================================
# Visualization Functions
# ============================================

def set_style():
    """Apply the dark RSK World theme to matplotlib.

    Does nothing when matplotlib is not installed.
    """
    if MATPLOTLIB_AVAILABLE:
        plt.style.use('dark_background')

        # Overrides layered on top of the built-in dark style.
        theme = {
            'figure.facecolor': '#0d0d0d',
            'axes.facecolor': '#1a1a1a',
            'axes.edgecolor': '#333333',
            'axes.labelcolor': '#ffffff',
            'text.color': '#ffffff',
            'xtick.color': '#b3b3b3',
            'ytick.color': '#b3b3b3',
            'grid.color': '#333333',
            'font.family': 'sans-serif',
            'font.size': 10
        }
        plt.rcParams.update(theme)


def plot_sentiment_distribution(data: List[Dict], output_path: str):
    """Render the sentiment distribution as a pie chart and save it.

    Records without a ``sentiment`` key are counted as ``'unknown'``.
    Prints a notice and returns when matplotlib is not installed.

    Args:
        data: Records with a ``sentiment`` key.
        output_path: Image file to write.
    """
    if not MATPLOTLIB_AVAILABLE:
        print("⚠ Matplotlib not installed. Skipping chart generation.")
        return

    set_style()

    tally = Counter(record.get('sentiment', 'unknown') for record in data)
    labels = list(tally)
    values = list(tally.values())

    # RSK World color scheme; labels outside the palette fall back to grey.
    fallback = '#6c757d'
    palette = {
        'positive': '#28a745',
        'neutral': '#ffc107',
        'negative': '#dc3545',
        'unknown': '#6c757d'
    }
    chart_colors = [palette.get(name, fallback) for name in labels]

    fig, ax = plt.subplots(figsize=(10, 8), facecolor='#0d0d0d')

    pie_options = dict(
        labels=labels,
        autopct='%1.1f%%',
        colors=chart_colors,
        explode=[0.02] * len(labels),
        shadow=True,
        startangle=90
    )
    wedges, label_texts, percent_texts = ax.pie(values, **pie_options)

    # Style the slice labels and the percentage captions.
    for label_text in label_texts:
        label_text.set_color('white')
        label_text.set_fontsize(12)
    for percent_text in percent_texts:
        percent_text.set_color('white')
        percent_text.set_fontweight('bold')

    ax.set_title('Sentiment Distribution\nRSK World - Sentiment Analysis Dataset',
                 fontsize=14, fontweight='bold', color='white', pad=20)

    # Legend shows the absolute count next to each sentiment.
    legend_entries = [f'{name.capitalize()}: {count}' for name, count in zip(labels, values)]
    ax.legend(wedges, legend_entries, loc='lower right', fontsize=10)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, facecolor='#0d0d0d', edgecolor='none', bbox_inches='tight')
    plt.close()

    print(f"✓ Saved sentiment distribution chart to {output_path}")


def plot_source_distribution(data: List[Dict], output_path: str):
    """Render the number of samples per data source as a bar chart and save it.

    Records without a ``source`` key are counted as ``'unknown'``.
    Returns silently when matplotlib is not installed.

    Args:
        data: Records with a ``source`` key.
        output_path: Image file to write.
    """
    if not MATPLOTLIB_AVAILABLE:
        return

    set_style()

    tally = Counter(record.get('source', 'unknown') for record in data)
    labels = list(tally)
    values = list(tally.values())

    fig, ax = plt.subplots(figsize=(12, 6), facecolor='#0d0d0d')

    # Red-to-pink shades give a gradient-like look; trimmed to the bar count.
    shades = ['#dc3545', '#e35d6a', '#e8838e', '#eda9b2', '#f2ced6']
    bar_colors = shades[:len(labels)]

    bars = ax.bar(labels, values, color=bar_colors, edgecolor='#333333', linewidth=1)

    # Write each count just above its bar.
    for bar, count in zip(bars, values):
        top_center = (bar.get_x() + bar.get_width() / 2, bar.get_height())
        ax.annotate(f'{count}',
                    xy=top_center,
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom',
                    color='white', fontweight='bold')

    ax.set_xlabel('Data Source', fontsize=12, color='white')
    ax.set_ylabel('Number of Samples', fontsize=12, color='white')
    ax.set_title('Data Source Distribution\nRSK World - Sentiment Analysis Dataset',
                 fontsize=14, fontweight='bold', color='white', pad=20)

    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(output_path, dpi=150, facecolor='#0d0d0d', edgecolor='none', bbox_inches='tight')
    plt.close()

    print(f"✓ Saved source distribution chart to {output_path}")


def plot_text_length_histogram(data: List[Dict], output_path: str):
    """Render overlaid word-count histograms per sentiment and save them.

    Only the ``positive``, ``neutral`` and ``negative`` sentiments are
    plotted; a record without a ``sentiment`` key counts as ``neutral`` and
    records with any other sentiment are left out. Returns silently when
    matplotlib is not installed.

    Args:
        data: Records with ``text`` and ``sentiment`` keys.
        output_path: Image file to write.
    """
    if not MATPLOTLIB_AVAILABLE:
        return

    set_style()

    palette = {'positive': '#28a745', 'neutral': '#ffc107', 'negative': '#dc3545'}

    # One list of word counts per plotted sentiment, in palette order.
    word_counts = {name: [] for name in palette}

    for record in data:
        name = record.get('sentiment', 'neutral')
        n_words = len(record.get('text', '').split())
        bucket = word_counts.get(name)
        if bucket is not None:
            bucket.append(n_words)

    fig, ax = plt.subplots(figsize=(12, 6), facecolor='#0d0d0d')

    for name, counts in word_counts.items():
        if not counts:
            continue
        ax.hist(counts, bins=20, alpha=0.6, label=name.capitalize(),
                color=palette[name], edgecolor='white', linewidth=0.5)

    ax.set_xlabel('Word Count', fontsize=12, color='white')
    ax.set_ylabel('Frequency', fontsize=12, color='white')
    ax.set_title('Text Length Distribution by Sentiment\nRSK World - Sentiment Analysis Dataset',
                 fontsize=14, fontweight='bold', color='white', pad=20)
    ax.legend(loc='upper right')

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, facecolor='#0d0d0d', edgecolor='none', bbox_inches='tight')
    plt.close()

    print(f"✓ Saved text length histogram to {output_path}")


def plot_word_frequency(data: List[Dict], output_path: str, top_n: int = 20):
    """Render the most frequent words as a horizontal bar chart and save it.

    Word frequencies come from ``get_top_words``. Returns silently when
    matplotlib is not installed.

    Args:
        data: Records with a ``text`` key.
        output_path: Image file to write.
        top_n: Number of words requested from ``get_top_words``; also
            shown in the chart title.
    """
    if not MATPLOTLIB_AVAILABLE:
        return

    set_style()

    ranking = get_top_words(data, top_n=top_n)

    words = []
    counts = []
    for word, count in ranking:
        words.append(word)
        counts.append(count)

    fig, ax = plt.subplots(figsize=(14, 8), facecolor='#0d0d0d')

    # Horizontal bars, one row per word.
    rows = range(len(words))
    bars = ax.barh(rows, counts, color='#dc3545', edgecolor='#333333', linewidth=1)

    ax.set_yticks(rows)
    ax.set_yticklabels(words)
    # Flip the axis so the first (most frequent) word is at the top.
    ax.invert_yaxis()

    # Write each count just right of its bar.
    for bar, count in zip(bars, counts):
        bar_end = (bar.get_width(), bar.get_y() + bar.get_height()/2)
        ax.annotate(f'{count}',
                    xy=bar_end,
                    xytext=(5, 0),
                    textcoords="offset points",
                    ha='left', va='center',
                    color='white', fontsize=9)

    ax.set_xlabel('Frequency', fontsize=12, color='white')
    ax.set_title(f'Top {top_n} Most Frequent Words\nRSK World - Sentiment Analysis Dataset',
                 fontsize=14, fontweight='bold', color='white', pad=20)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, facecolor='#0d0d0d', edgecolor='none', bbox_inches='tight')
    plt.close()

    print(f"✓ Saved word frequency chart to {output_path}")


def generate_wordcloud(data: List[Dict], output_path: str, sentiment: Optional[str] = None):
    """Generate a word cloud image, optionally for a single sentiment.

    The cloud is rendered through matplotlib (with a title) when it is
    installed, otherwise written directly by the wordcloud library. When
    no words are available (no matching records, empty texts, or only
    words the wordcloud library discards) a notice is printed and no file
    is written.

    Args:
        data: Records with ``text`` and ``sentiment`` keys.
        output_path: Image file to write.
        sentiment: If given (and non-empty), only records whose
            ``sentiment`` equals this value are used. ``'positive'`` and
            ``'negative'`` select green and red colormaps; anything else
            uses blue.
    """
    if not WORDCLOUD_AVAILABLE:
        print("⚠ WordCloud not installed. Skipping word cloud generation.")
        return

    # Collect text; "or ''" keeps a present-but-None text from breaking join().
    texts = [
        sample.get('text') or ''
        for sample in data
        if not sentiment or sample.get('sentiment') == sentiment
    ]
    text = ' '.join(texts)

    # WordCloud.generate raises ValueError when it ends up with zero words,
    # so bail out early on obviously empty input.
    if not text.strip():
        print("⚠ No text available for word cloud. Skipping word cloud generation.")
        return

    # Color based on sentiment
    colormap = {'positive': 'Greens', 'negative': 'Reds'}.get(sentiment, 'Blues')

    try:
        wordcloud = WordCloud(
            width=1200,
            height=600,
            background_color='#0d0d0d',
            colormap=colormap,
            max_words=100,
            min_font_size=10,
            max_font_size=150
        ).generate(text)
    except ValueError as exc:
        # Still possible when every word is filtered out by the library.
        print(f"⚠ Could not generate word cloud: {exc}")
        return

    if MATPLOTLIB_AVAILABLE:
        fig, ax = plt.subplots(figsize=(12, 6), facecolor='#0d0d0d')
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')

        title = f'Word Cloud - {sentiment.capitalize() if sentiment else "All"} Sentiment'
        ax.set_title(f'{title}\nRSK World - Sentiment Analysis Dataset',
                     fontsize=14, fontweight='bold', color='white', pad=20)

        plt.tight_layout()
        plt.savefig(output_path, dpi=150, facecolor='#0d0d0d', edgecolor='none', bbox_inches='tight')
        plt.close()
    else:
        wordcloud.to_file(output_path)

    print(f"✓ Saved word cloud to {output_path}")


def generate_html_report(data: List[Dict], stats: Dict, output_path: str, chart_dir: str):
    """Write a self-styled HTML statistics page to ``output_path``.

    The page shows the sample totals from ``stats['total_samples']`` and
    ``stats['sentiment_distribution']``, the length figures from
    ``stats['text_statistics']`` and four ``<img>`` tags that point at the
    chart PNGs by bare file name. ``data`` and ``chart_dir`` are accepted but
    not used by the template.
    """
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    total_samples = stats['total_samples']
    distribution = stats['sentiment_distribution']
    text_stats = stats['text_statistics']

    page = f"""<!DOCTYPE html>
<!--
================================================================================
 * Sentiment Analysis Dataset - Statistics Report
 * 
 * Author: Molla Samser (Founder)
 * Designer & Tester: Rima Khatun
 * Website: https://rskworld.in
 * © 2026 RSK World - Free Programming Resources & Source Code
================================================================================
-->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Dataset Statistics Report - RSK World</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background: linear-gradient(135deg, #1a0a0a 0%, #0d0d0d 50%, #0a0a1a 100%);
            color: #ffffff;
            min-height: 100vh;
            padding: 40px;
        }}
        .container {{ max-width: 1200px; margin: 0 auto; }}
        .header {{
            text-align: center;
            margin-bottom: 40px;
            padding: 30px;
            background: rgba(220, 53, 69, 0.1);
            border-radius: 15px;
            border: 1px solid rgba(220, 53, 69, 0.3);
        }}
        .header h1 {{ color: #dc3545; font-size: 2.5em; margin-bottom: 10px; }}
        .header p {{ color: #b3b3b3; }}
        .stats-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin-bottom: 40px;
        }}
        .stat-card {{
            background: #1e1e1e;
            padding: 25px;
            border-radius: 12px;
            border: 1px solid #333;
            text-align: center;
        }}
        .stat-card h3 {{ color: #dc3545; font-size: 2em; margin-bottom: 10px; }}
        .stat-card p {{ color: #b3b3b3; }}
        .section {{
            background: #1e1e1e;
            padding: 30px;
            border-radius: 12px;
            border: 1px solid #333;
            margin-bottom: 30px;
        }}
        .section h2 {{
            color: #dc3545;
            margin-bottom: 20px;
            padding-bottom: 10px;
            border-bottom: 1px solid #333;
        }}
        .chart-container {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
            gap: 20px;
        }}
        .chart {{ background: #0d0d0d; padding: 15px; border-radius: 8px; }}
        .chart img {{ width: 100%; height: auto; border-radius: 5px; }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin-top: 15px;
        }}
        th, td {{
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid #333;
        }}
        th {{ color: #dc3545; background: #0d0d0d; }}
        .footer {{
            text-align: center;
            padding: 20px;
            color: #6c757d;
            margin-top: 40px;
        }}
        .footer a {{ color: #dc3545; text-decoration: none; }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>📊 Dataset Statistics Report</h1>
            <p>Sentiment Analysis Dataset - RSK World</p>
            <p>Generated: {generated_at}</p>
        </div>
        
        <div class="stats-grid">
            <div class="stat-card">
                <h3>{total_samples:,}</h3>
                <p>Total Samples</p>
            </div>
            <div class="stat-card">
                <h3>{distribution.get('positive', 0):,}</h3>
                <p>Positive Samples</p>
            </div>
            <div class="stat-card">
                <h3>{distribution.get('neutral', 0):,}</h3>
                <p>Neutral Samples</p>
            </div>
            <div class="stat-card">
                <h3>{distribution.get('negative', 0):,}</h3>
                <p>Negative Samples</p>
            </div>
        </div>
        
        <div class="section">
            <h2>📈 Text Statistics</h2>
            <table>
                <tr><th>Metric</th><th>Value</th></tr>
                <tr><td>Average Text Length</td><td>{text_stats['avg_length']:.1f} characters</td></tr>
                <tr><td>Average Word Count</td><td>{text_stats['avg_word_count']:.1f} words</td></tr>
                <tr><td>Min Text Length</td><td>{text_stats['min_length']} characters</td></tr>
                <tr><td>Max Text Length</td><td>{text_stats['max_length']} characters</td></tr>
            </table>
        </div>
        
        <div class="section">
            <h2>📊 Visualizations</h2>
            <div class="chart-container">
                <div class="chart">
                    <img src="sentiment_distribution.png" alt="Sentiment Distribution">
                </div>
                <div class="chart">
                    <img src="source_distribution.png" alt="Source Distribution">
                </div>
                <div class="chart">
                    <img src="text_length_histogram.png" alt="Text Length Histogram">
                </div>
                <div class="chart">
                    <img src="word_frequency.png" alt="Word Frequency">
                </div>
            </div>
        </div>
        
        <div class="footer">
            <p>© 2026 RSK World - Free Programming Resources & Source Code</p>
            <p>Author: <strong>Molla Samser</strong> | Designer: <strong>Rima Khatun</strong></p>
            <p><a href="https://rskworld.in">rskworld.in</a> | help@rskworld.in</p>
        </div>
    </div>
</body>
</html>
"""

    with open(output_path, 'w', encoding='utf-8') as report_file:
        report_file.write(page)

    print(f"✓ Generated HTML report: {output_path}")


# ============================================
# Main Function
# ============================================

def main():
    """Command-line entry point of the visualization tool.

    Parses the CLI options, loads the dataset, prints summary statistics and,
    unless ``--stats-only`` is given, writes the charts (and optionally the
    word clouds and the HTML report) into the output directory.
    """
    cli = argparse.ArgumentParser(
        description="Data Visualization Tool - RSK World",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python visualize_data.py --input ../data/sentiment_data.csv
  python visualize_data.py --input ../data/sentiment_data.json --output ./charts/
  python visualize_data.py --input ../data/sentiment_data.csv --all-charts

Author: Molla Samser (Founder) - RSK World
Website: https://rskworld.in
        """
    )
    cli.add_argument("--input", "-i", type=str, required=True,
                     help="Input file path (CSV or JSON)")
    cli.add_argument("--output", "-o", type=str, default="./charts",
                     help="Output directory for charts (default: ./charts)")
    cli.add_argument("--all-charts", "-a", action="store_true",
                     help="Generate all available charts")
    cli.add_argument("--stats-only", "-s", action="store_true",
                     help="Only print statistics, no charts")
    cli.add_argument("--html-report", "-r", action="store_true",
                     help="Generate HTML report")
    args = cli.parse_args()

    print("""
╔══════════════════════════════════════════════════════════════════╗
║         RSK World - Data Visualization Tool                      ║
║                                                                  ║
║  Author: Molla Samser (Founder)                                  ║
║  Website: https://rskworld.in                                    ║
║  © 2026 RSK World - Free Programming Resources & Source Code     ║
╚══════════════════════════════════════════════════════════════════╝
    """)

    # Report which optional plotting backends were importable
    print("Available features:")
    print(f"  {'✓' if MATPLOTLIB_AVAILABLE else '✗'} Charts (matplotlib)")
    print(f"  {'✓' if WORDCLOUD_AVAILABLE else '✗'} Word Clouds (wordcloud)")
    print()

    # Read the dataset
    print(f"Loading data from {args.input}...")
    data = load_data(args.input)
    print(f"Loaded {len(data)} samples")
    print()

    stats = calculate_statistics(data)

    # Console summary
    divider = "=" * 50
    print(divider)
    print("DATASET STATISTICS")
    print(divider)
    total = stats['total_samples']
    print(f"Total samples: {total:,}")
    print()
    print("Sentiment Distribution:")
    for label, count in stats['sentiment_distribution'].items():
        share = count / total * 100
        print(f"  {label.capitalize()}: {count:,} ({share:.1f}%)")
    print()
    print("Source Distribution:")
    for origin, count in stats['source_distribution'].items():
        print(f"  {origin}: {count:,}")
    print()
    print("Text Statistics:")
    text_stats = stats['text_statistics']
    print(f"  Average length: {text_stats['avg_length']:.1f} chars")
    print(f"  Average words: {text_stats['avg_word_count']:.1f}")
    print()

    if args.stats_only:
        return

    out_dir = args.output
    os.makedirs(out_dir, exist_ok=True)

    if MATPLOTLIB_AVAILABLE:
        print("\nGenerating charts...")

        # (plot function, file name) pairs, drawn in this order
        chart_jobs = (
            (plot_sentiment_distribution, 'sentiment_distribution.png'),
            (plot_source_distribution, 'source_distribution.png'),
            (plot_text_length_histogram, 'text_length_histogram.png'),
            (plot_word_frequency, 'word_frequency.png'),
        )
        for draw, filename in chart_jobs:
            draw(data, os.path.join(out_dir, filename))

        if args.all_charts and WORDCLOUD_AVAILABLE:
            print("\nGenerating word clouds...")
            # None selects every sample regardless of sentiment
            cloud_jobs = (
                ('wordcloud_all.png', None),
                ('wordcloud_positive.png', 'positive'),
                ('wordcloud_negative.png', 'negative'),
            )
            for filename, wanted in cloud_jobs:
                generate_wordcloud(data, os.path.join(out_dir, filename), wanted)

    if args.html_report:
        report_path = os.path.join(out_dir, 'report.html')
        generate_html_report(data, stats, report_path, out_dir)

    print()
    print("✓ Visualization complete!")
    print(f"  Output directory: {out_dir}")
    print()

# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

733 lines•24.7 KB
python
scripts/preprocess_data.py
Raw Download
Find: Go to:
#!/usr/bin/env python3
"""
================================================================================
 * Sentiment Analysis Dataset - Data Preprocessing Script
 * 
 * Project: Sentiment Analysis Dataset
 * Description: Preprocess text data for NLP models - cleaning, tokenization,
 *              normalization, and feature extraction.
 * Category: Text Data
 * Difficulty: Intermediate
 * 
 * Author: Molla Samser (Founder)
 * Designer & Tester: Rima Khatun
 * Website: https://rskworld.in
 * Email: help@rskworld.in | support@rskworld.in
 * Phone: +91 93305 39277
 * 
 * © 2026 RSK World - Free Programming Resources & Source Code
 * All rights reserved.
================================================================================

Usage:
    python preprocess_data.py --input ./data/sentiment_data.csv --output ./preprocessed/
    python preprocess_data.py --input ./data/sentiment_data.json --lowercase --remove-stopwords
    python preprocess_data.py --input ./data/ --batch --lemmatize
"""

import argparse
import csv
import json
import os
import re
import string
from typing import List, Dict, Optional, Tuple
from datetime import datetime
from collections import Counter

# Try to import optional NLP libraries
try:
    import nltk
    from nltk.tokenize import word_tokenize, sent_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

try:
    import spacy
    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False


# ============================================
# Text Preprocessing Functions
# ============================================

class TextPreprocessor:
    """Configurable text cleaner and tokenizer for sentiment analysis.

    Each constructor flag switches one cleaning or token-filtering step on or
    off. NLTK is used for tokenization, stopwords, stemming and lemmatization
    when it is importable; otherwise tokenization falls back to whitespace
    splitting and the NLTK-dependent steps are silently inactive.
    """

    # NLTK resources used by this class: (lookup path, package name, label
    # shown in the download message)
    _NLTK_RESOURCES = (
        ('tokenizers/punkt', 'punkt', 'punkt tokenizer'),
        ('corpora/stopwords', 'stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet', 'wordnet'),
    )

    # Contractions are applied in insertion order, so the irregular forms
    # MUST come before the generic suffix rules: otherwise "'s" turns
    # "let's" into "let is" and "n't" turns "won't" into "wo not".
    _CONTRACTIONS = {
        # irregular forms first
        "won't": "will not", "can't": "cannot", "let's": "let us",
        # generic suffix rules
        "n't": " not",
        "'re": " are", "'s": " is", "'d": " would", "'ll": " will",
        "'ve": " have", "'m": " am",
        # explicit forms (already covered by the suffix rules above)
        "i'm": "i am", "you're": "you are", "he's": "he is",
        "she's": "she is", "it's": "it is", "we're": "we are",
        "they're": "they are", "i've": "i have", "you've": "you have",
        "we've": "we have", "they've": "they have", "i'd": "i would",
        "you'd": "you would", "he'd": "he would", "she'd": "she would",
        "we'd": "we would", "they'd": "they would", "i'll": "i will",
        "you'll": "you will", "he'll": "he will", "she'll": "she will",
        "we'll": "we will", "they'll": "they will", "isn't": "is not",
        "aren't": "are not", "wasn't": "was not", "weren't": "were not",
        "hasn't": "has not", "haven't": "have not", "hadn't": "had not",
        "doesn't": "does not", "don't": "do not", "didn't": "did not",
        "wouldn't": "would not", "shouldn't": "should not",
        "couldn't": "could not", "mustn't": "must not"
    }

    def __init__(
        self,
        lowercase: bool = True,
        remove_punctuation: bool = True,
        remove_numbers: bool = False,
        remove_stopwords: bool = False,
        remove_urls: bool = True,
        remove_emails: bool = True,
        remove_mentions: bool = True,
        remove_hashtags: bool = False,
        expand_contractions: bool = True,
        lemmatize: bool = False,
        stem: bool = False,
        min_word_length: int = 1,
        max_word_length: int = 50
    ):
        """Store the step flags and prepare the NLTK helpers when available.

        ``min_word_length`` and ``max_word_length`` are inclusive bounds used
        by :meth:`tokenize`; the remaining flags enable the step they name.
        """
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.remove_numbers = remove_numbers
        self.remove_stopwords = remove_stopwords
        self.remove_urls = remove_urls
        self.remove_emails = remove_emails
        self.remove_mentions = remove_mentions
        self.remove_hashtags = remove_hashtags
        self.expand_contractions = expand_contractions
        self.lemmatize = lemmatize
        self.stem = stem
        self.min_word_length = min_word_length
        self.max_word_length = max_word_length

        # Defaults used when NLTK is missing: the related steps become no-ops
        self.stop_words = set()
        self.stemmer = None
        self.lemmatizer = None

        if NLTK_AVAILABLE:
            # Fetch each resource only if it is not installed yet
            for resource_path, package, label in self._NLTK_RESOURCES:
                try:
                    nltk.data.find(resource_path)
                except LookupError:
                    print(f"Downloading NLTK {label}...")
                    nltk.download(package, quiet=True)

            try:
                self.stop_words = set(stopwords.words('english'))
            except LookupError:
                # The download did not succeed (e.g. no network); keep the
                # empty set so construction still works
                print("⚠ NLTK stopwords unavailable. Stopword removal will be skipped.")
            self.stemmer = PorterStemmer()
            self.lemmatizer = WordNetLemmatizer()

        # Per-instance copy so callers may customise it safely
        self.contractions = dict(self._CONTRACTIONS)

    def clean_text(self, text: str) -> str:
        """Return ``text`` after applying every enabled cleaning step.

        Steps run in this order: URLs, e-mails, mentions, hashtags,
        contractions, lowercasing, digits, punctuation, whitespace collapse.
        Falsy input yields an empty string.
        """
        if not text:
            return ""

        # Accept non-string values such as numbers
        text = str(text)

        if self.remove_urls:
            text = re.sub(r'https?://\S+|www\.\S+', '', text)

        if self.remove_emails:
            text = re.sub(r'\S+@\S+', '', text)

        # Mentions (@username)
        if self.remove_mentions:
            text = re.sub(r'@\w+', '', text)

        # Hashtags: drop them entirely, or keep the word and strip only '#'
        if self.remove_hashtags:
            text = re.sub(r'#\w+', '', text)
        else:
            text = re.sub(r'#(\w+)', r'\1', text)

        # Contractions are replaced one after another in dict order
        if self.expand_contractions:
            for contraction, expansion in self.contractions.items():
                text = re.sub(re.escape(contraction), expansion, text, flags=re.IGNORECASE)

        if self.lowercase:
            text = text.lower()

        if self.remove_numbers:
            text = re.sub(r'\d+', '', text)

        if self.remove_punctuation:
            text = text.translate(str.maketrans('', '', string.punctuation))

        # Collapse runs of whitespace and trim both ends
        return ' '.join(text.split())

    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into tokens and apply the enabled token filters.

        Uses NLTK's ``word_tokenize`` when available and falls back to
        whitespace splitting if it is missing or fails. Tokens are then
        filtered by length and stopwords, and optionally stemmed and
        lemmatized (in that order).
        """
        if NLTK_AVAILABLE:
            try:
                tokens = word_tokenize(text)
            except Exception:
                # Best effort: e.g. tokenizer data missing; do not swallow
                # KeyboardInterrupt/SystemExit like a bare except would
                tokens = text.split()
        else:
            tokens = text.split()

        # Keep tokens whose length lies within the inclusive bounds
        tokens = [t for t in tokens if self.min_word_length <= len(t) <= self.max_word_length]

        if self.remove_stopwords and self.stop_words:
            tokens = [t for t in tokens if t.lower() not in self.stop_words]

        if self.stem and self.stemmer:
            tokens = [self.stemmer.stem(t) for t in tokens]

        if self.lemmatize and self.lemmatizer:
            tokens = [self.lemmatizer.lemmatize(t) for t in tokens]

        return tokens

    def process(self, text: str) -> Dict:
        """Clean and tokenize ``text`` and return the results with metadata.

        Returns:
            Dict with keys ``original``, ``cleaned``, ``tokens``,
            ``word_count`` (number of tokens), ``char_count`` (length of the
            cleaned text) and ``avg_word_length`` (0 when there are no tokens).
        """
        original_text = text
        cleaned_text = self.clean_text(text)
        tokens = self.tokenize(cleaned_text)

        return {
            "original": original_text,
            "cleaned": cleaned_text,
            "tokens": tokens,
            "word_count": len(tokens),
            "char_count": len(cleaned_text),
            "avg_word_length": sum(len(t) for t in tokens) / len(tokens) if tokens else 0
        }


# ============================================
# Feature Extraction
# ============================================

class FeatureExtractor:
    """Derive simple lexicon-based sentiment features from token lists."""

    def __init__(self):
        """Build the small word lists the feature counts are based on."""
        # Simplified sentiment lexicons
        self.positive_words = set(
            "good great excellent amazing wonderful fantastic "
            "awesome best love perfect happy beautiful nice "
            "brilliant outstanding superb incredible impressive "
            "recommend satisfied delighted pleased exceptional".split()
        )
        self.negative_words = set(
            "bad terrible awful horrible worst hate poor "
            "disappointing disappointed broken defective useless "
            "waste garbage scam fraud regret avoid never "
            "unhappy frustrated angry annoyed unacceptable".split()
        )
        self.negation_words = set(
            "not no never neither nobody nothing nowhere "
            "hardly scarcely barely n't cannot can't won't".split()
        )
        self.intensifiers = set(
            "very really extremely absolutely completely totally "
            "highly incredibly remarkably exceptionally super".split()
        )

    def extract_features(self, tokens: List[str]) -> Dict:
        """Count lexicon hits in ``tokens`` and compute a crude sentiment score.

        Lexicon matching is case-insensitive. The score is
        ``(positive - negative) / (len(tokens) + 1)``, multiplied by ``-0.5``
        when at least one negation word is present, and rounded to 4 places.
        """
        lowered = [token.lower() for token in tokens]
        # +1 keeps the ratios defined for an empty token list
        denominator = len(tokens) + 1

        def hits(lexicon):
            return sum(word in lexicon for word in lowered)

        n_positive = hits(self.positive_words)
        n_negative = hits(self.negative_words)
        n_negation = hits(self.negation_words)
        n_intensifier = hits(self.intensifiers)

        score = (n_positive - n_negative) / denominator
        if n_negation > 0:
            # Any negation flips the sign and halves the magnitude
            score *= -0.5

        features = {}
        features["positive_word_count"] = n_positive
        features["negative_word_count"] = n_negative
        features["negation_count"] = n_negation
        features["intensifier_count"] = n_intensifier
        features["sentiment_score"] = round(score, 4)
        # These three look at the tokens in their original casing
        features["exclamation_count"] = len([t for t in tokens if '!' in t])
        features["question_count"] = len([t for t in tokens if '?' in t])
        features["caps_ratio"] = len([t for t in tokens if t.isupper()]) / denominator
        return features


# ============================================
# Data Loading and Saving
# ============================================

def load_data(filepath: str) -> List[Dict]:
    """Load samples from a CSV or JSON file.

    CSV: lines that are blank or start with ``#`` are dropped before parsing,
    so a commented banner followed by an empty line (the layout written by
    ``save_preprocessed_csv``) does not get mistaken for the header row.
    JSON: if the document is an object with a ``data`` key, that value is
    returned; otherwise the parsed document is returned as-is.

    Args:
        filepath: Path whose extension (``.csv`` or ``.json``,
            case-insensitive) selects the parser.

    Returns:
        The loaded samples; for CSV, one dict per data row.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.json``.
    """
    ext = os.path.splitext(filepath)[1].lower()

    if ext == '.csv':
        # newline='' is what the csv module expects for files it parses
        with open(filepath, 'r', encoding='utf-8', newline='') as f:
            # A blank first line would otherwise become DictReader's (empty)
            # header and every record would load as {None: [...]}
            lines = [
                line for line in f
                if line.strip() and not line.strip().startswith('#')
            ]
        return list(csv.DictReader(lines))

    if ext == '.json':
        with open(filepath, 'r', encoding='utf-8') as f:
            content = json.load(f)
        if isinstance(content, dict) and 'data' in content:
            return content['data']
        return content

    raise ValueError(f"Unsupported file format: {ext}")


def save_preprocessed_csv(data: List[Dict], filepath: str):
    """Write ``data`` to ``filepath`` as CSV below a ``#``-commented banner.

    The parent directory is created when missing. Columns are the union of
    the keys of all rows, in first-seen order, so rows with differing keys
    are written with empty cells instead of making the writer raise. When
    ``data`` is empty only the banner is written.

    Args:
        data: Rows to write, one dict per row.
        filepath: Destination file path.
    """
    os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        f.write("""# ================================================================================
# Sentiment Analysis Dataset - Preprocessed Data
# 
# Project: Sentiment Analysis Dataset
# Preprocessed by: RSK World Text Preprocessor
# Website: https://rskworld.in
# 
# Author: Molla Samser (Founder)
# Designer & Tester: Rima Khatun
# Email: help@rskworld.in | support@rskworld.in
# 
# © 2026 RSK World - Free Programming Resources & Source Code
# ================================================================================

""")

        if data:
            # dict.fromkeys de-duplicates while keeping first-seen order
            fieldnames = list(dict.fromkeys(key for row in data for key in row))
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)

    print(f"✓ Saved preprocessed data to {filepath}")


def save_preprocessed_json(data: List[Dict], filepath: str, metadata: Dict = None):
    """Dump ``data`` to ``filepath`` as JSON under a ``_metadata`` header.

    The parent directory is created when missing. The header holds fixed
    project information, the current timestamp and the sample count; entries
    of ``metadata`` are merged on top and override same-named header keys.
    """
    target_dir = os.path.dirname(filepath) or '.'
    os.makedirs(target_dir, exist_ok=True)

    header = {
        "project": "Sentiment Analysis Dataset",
        "description": "Preprocessed sentiment analysis data",
        "preprocessor": "RSK World Text Preprocessor",
        "website": "https://rskworld.in",
        "author": "Molla Samser (Founder)",
        "designer_tester": "Rima Khatun",
        "email": "help@rskworld.in | support@rskworld.in",
        "copyright": "© 2026 RSK World - Free Programming Resources & Source Code",
        "processed_at": datetime.now().isoformat(),
        "total_samples": len(data),
    }
    if metadata:
        # Caller-supplied entries win over the defaults above
        header.update(metadata)

    document = {"_metadata": header, "data": data}

    with open(filepath, 'w', encoding='utf-8') as out:
        json.dump(document, out, indent=2, ensure_ascii=False)

    print(f"✓ Saved preprocessed data to {filepath}")


# ============================================
# Main Processing Pipeline
# ============================================

def process_dataset(
    input_path: str,
    output_dir: str,
    preprocessor: TextPreprocessor,
    feature_extractor: FeatureExtractor,
    extract_features: bool = True
) -> Tuple[List[Dict], List[Dict]]:
    """Preprocess every sample of a dataset file and save the results.

    Each sample's ``text`` is run through ``preprocessor.process``. Two
    outputs are written into ``output_dir`` (created when missing):
    ``cleaned_data.csv`` and ``tokenized_data.json``.

    Args:
        input_path: Dataset file passed to ``load_data``.
        output_dir: Directory that receives both output files.
        preprocessor: Cleaner/tokenizer applied to each text.
        feature_extractor: Used only when ``extract_features`` is true.
        extract_features: When true, the extracted features are merged into
            each cleaned row and stored under ``features`` in each tokenized
            row.

    Returns:
        ``(cleaned_data, tokenized_data)``: the two lists of row dicts that
        were written to disk.
    """
    # Numeric class ids; unknown or missing sentiments fall back to 1
    label_ids = {"positive": 2, "neutral": 1, "negative": 0}

    print(f"Loading data from {input_path}...")
    data = load_data(input_path)
    print(f"Loaded {len(data)} samples")

    cleaned_data = []
    tokenized_data = []

    print("Processing...")
    for i, sample in enumerate(data):
        result = preprocessor.process(sample.get('text', ''))

        # Samples without an id get their 1-based position instead
        sample_id = sample.get('id', i + 1)
        sentiment = sample.get('sentiment', '')

        cleaned_entry = {
            "id": sample_id,
            "cleaned_text": result['cleaned'],
            "sentiment": sentiment,
            "word_count": result['word_count']
        }

        tokenized_entry = {
            "id": sample_id,
            "tokens": result['tokens'],
            "sentiment": sentiment,
            "label": label_ids.get(sentiment, 1)
        }

        if extract_features:
            features = feature_extractor.extract_features(result['tokens'])
            cleaned_entry.update(features)
            tokenized_entry['features'] = features

        cleaned_data.append(cleaned_entry)
        tokenized_data.append(tokenized_entry)

        # Progress message every 500 samples
        if (i + 1) % 500 == 0:
            print(f"  Processed {i + 1}/{len(data)} samples...")

    # Save outputs
    os.makedirs(output_dir, exist_ok=True)

    cleaned_csv_path = os.path.join(output_dir, 'cleaned_data.csv')
    tokenized_json_path = os.path.join(output_dir, 'tokenized_data.json')

    save_preprocessed_csv(cleaned_data, cleaned_csv_path)
    save_preprocessed_json(tokenized_data, tokenized_json_path, {
        "preprocessing": {
            "lowercase": preprocessor.lowercase,
            "remove_punctuation": preprocessor.remove_punctuation,
            "remove_stopwords": preprocessor.remove_stopwords,
            "lemmatize": preprocessor.lemmatize,
            "stem": preprocessor.stem
        },
        "label_mapping": {"0": "negative", "1": "neutral", "2": "positive"}
    })

    return cleaned_data, tokenized_data


# ============================================
# Vocabulary Builder
# ============================================

def build_vocabulary(tokenized_data: List[Dict], min_freq: int = 2) -> Dict:
    """Count tokens across all samples and index the frequent ones.

    Words occurring at least ``min_freq`` times receive consecutive indices
    in alphabetical order, after the reserved ``<PAD>`` (0) and ``<UNK>`` (1)
    entries. The returned ``word_counts`` holds the 1000 most common words
    regardless of ``min_freq``.
    """
    frequencies = Counter()
    for entry in tokenized_data:
        frequencies.update(entry.get('tokens', []))

    # Alphabetical list of the words that are frequent enough to keep
    kept_words = sorted(
        word for word, seen in frequencies.items() if seen >= min_freq
    )

    # Reserved slots first, then one new index per kept word
    word2idx = {"<PAD>": 0, "<UNK>": 1}
    for word in kept_words:
        word2idx[word] = len(word2idx)

    summary = {}
    summary["vocabulary_size"] = len(word2idx)
    summary["word_counts"] = dict(frequencies.most_common(1000))
    summary["word2idx"] = word2idx
    return summary


# ============================================
# Main Function
# ============================================

def _build_arg_parser():
    """Create the argparse parser describing the preprocessing CLI."""
    parser = argparse.ArgumentParser(
        description="Preprocess sentiment analysis data - RSK World",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python preprocess_data.py --input ../data/sentiment_data.csv
  python preprocess_data.py --input ../data/sentiment_data.json --lowercase --remove-stopwords
  python preprocess_data.py --input ../data/sentiment_data.csv --lemmatize --extract-features

Author: Molla Samser (Founder) - RSK World
Website: https://rskworld.in
        """
    )

    parser.add_argument(
        "--input", "-i",
        type=str,
        required=True,
        help="Input file path (CSV or JSON)"
    )

    parser.add_argument(
        "--output", "-o",
        type=str,
        default="./preprocessed",
        help="Output directory (default: ./preprocessed)"
    )

    # The three default-on switches below each get a "--no-*" counterpart.
    # With only store_true + default=True the option could never be turned
    # off; the store_false twin writes to the same dest so it can.
    parser.add_argument(
        "--lowercase", "-l",
        dest="lowercase",
        action="store_true",
        default=True,
        help="Convert text to lowercase (default: True)"
    )
    parser.add_argument(
        "--no-lowercase",
        dest="lowercase",
        action="store_false",
        help="Do not convert text to lowercase"
    )

    parser.add_argument(
        "--remove-stopwords", "-s",
        action="store_true",
        help="Remove stopwords"
    )

    parser.add_argument(
        "--remove-punctuation", "-p",
        dest="remove_punctuation",
        action="store_true",
        default=True,
        help="Remove punctuation (default: True)"
    )
    parser.add_argument(
        "--no-remove-punctuation",
        dest="remove_punctuation",
        action="store_false",
        help="Do not remove punctuation"
    )

    parser.add_argument(
        "--lemmatize",
        action="store_true",
        help="Apply lemmatization"
    )

    parser.add_argument(
        "--stem",
        action="store_true",
        help="Apply stemming"
    )

    parser.add_argument(
        "--extract-features", "-f",
        dest="extract_features",
        action="store_true",
        default=True,
        help="Extract sentiment features (default: True)"
    )
    parser.add_argument(
        "--no-extract-features",
        dest="extract_features",
        action="store_false",
        help="Do not extract sentiment features"
    )

    parser.add_argument(
        "--build-vocab", "-v",
        action="store_true",
        help="Build vocabulary file"
    )

    parser.add_argument(
        "--min-freq",
        type=int,
        default=2,
        help="Minimum word frequency for vocabulary (default: 2)"
    )

    return parser


def main():
    """Command-line entry point for the preprocessing script.

    Parses the command line, prints a banner and the effective settings,
    runs ``process_dataset`` on the input file and, when ``--build-vocab``
    is given, writes ``vocabulary.json`` into the output directory.
    """
    args = _build_arg_parser().parse_args()

    print("""
╔══════════════════════════════════════════════════════════════════╗
║         RSK World - Sentiment Analysis Data Preprocessor         ║
║                                                                  ║
║  Author: Molla Samser (Founder)                                  ║
║  Website: https://rskworld.in                                    ║
║  © 2026 RSK World - Free Programming Resources & Source Code     ║
╚══════════════════════════════════════════════════════════════════╝
    """)

    # Warn (but keep going) when NLTK is unavailable.
    if not NLTK_AVAILABLE:
        print("⚠ NLTK not installed. Some features may be limited.")
        print("  Install with: pip install nltk")
        print()

    # Initialize preprocessor
    preprocessor = TextPreprocessor(
        lowercase=args.lowercase,
        remove_punctuation=args.remove_punctuation,
        remove_stopwords=args.remove_stopwords,
        lemmatize=args.lemmatize,
        stem=args.stem
    )

    feature_extractor = FeatureExtractor()

    print(f"Input: {args.input}")
    print(f"Output: {args.output}")
    print("Settings:")
    print(f"  - Lowercase: {args.lowercase}")
    print(f"  - Remove punctuation: {args.remove_punctuation}")
    print(f"  - Remove stopwords: {args.remove_stopwords}")
    print(f"  - Lemmatize: {args.lemmatize}")
    print(f"  - Stem: {args.stem}")
    print(f"  - Extract features: {args.extract_features}")
    print()

    # Process dataset
    cleaned_data, tokenized_data = process_dataset(
        args.input,
        args.output,
        preprocessor,
        feature_extractor,
        args.extract_features
    )

    # Build vocabulary if requested
    if args.build_vocab:
        print()
        print("Building vocabulary...")
        vocab = build_vocabulary(tokenized_data, args.min_freq)

        vocab_path = os.path.join(args.output, 'vocabulary.json')
        with open(vocab_path, 'w', encoding='utf-8') as f:
            json.dump(vocab, f, indent=2)
        print(f"✓ Saved vocabulary to {vocab_path}")
        print(f"  Vocabulary size: {vocab['vocabulary_size']}")

    print()
    print("✓ Preprocessing complete!")
    print()


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

635 lines • 21.9 KB
python

About RSK World

Founded by Molla Samser, with Designer & Tester Rima Khatun, RSK World is your one-stop destination for free programming resources, source code, and development tools.

Founder: Molla Samser
Designer & Tester: Rima Khatun

Development

  • Game Development
  • Web Development
  • Mobile Development
  • AI Development
  • Development Tools

Legal

  • Terms & Conditions
  • Privacy Policy
  • Disclaimer

Contact Info

Nutanhat, Mongolkote
Purba Burdwan, West Bengal
India, 713147

+91 93305 39277

hello@rskworld.in
support@rskworld.in

© 2026 RSK World. All rights reserved.

Content used for educational purposes only. View Disclaimer