help@rskworld.in +91 93305 39277
RSK World
  • Home
  • Development
    • Web Development
    • Mobile Apps
    • Software
    • Games
    • Project
  • Technologies
    • Data Science
    • AI Development
    • Cloud Development
    • Blockchain
    • Cyber Security
    • Dev Tools
    • Testing Tools
  • About
  • Contact

Theme Settings

Color Scheme
Display Options
Font Size
100%
Back to Project
RSK World
language-translation
/
scripts
RSK World
language-translation
Language Translation Dataset - Machine Translation + Multilingual NLP + Parallel Corpus + Transformers
scripts
  • __pycache__
  • analyze_dataset.py (4.6 KB)
  • build_local_dictionary.py (6.7 KB)
  • convert_format.py (3.6 KB)
  • create_zip.py (4.1 KB)
  • download_translation_data.py (17.6 KB)
  • process_data.py (3.9 KB)
train_model.py · transcripts.json · process_data.py
scripts/process_data.py
Raw Download
Find: Go to:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Language Translation Dataset - Data Processing Script
Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
Copyright © 2016 RSK World. All rights reserved.

This script processes the language translation dataset for machine learning tasks.
"""

import pandas as pd
import json
import os
from pathlib import Path

# Configuration
# BASE_DIR is two levels up from this file, i.e. the parent of the directory
# that contains this script. DATA_DIR is its 'data' subdirectory, which is
# where main() looks for the train/validation files.
BASE_DIR = Path(__file__).parent.parent
DATA_DIR = BASE_DIR / 'data'

def load_tsv_data(filepath):
    """
    Read a tab-separated file into a DataFrame, reporting progress on stdout.

    Args:
        filepath: Location of the TSV file to read (decoded as UTF-8)

    Returns:
        pandas.DataFrame: Parsed contents of the file
    """
    print(f"Loading TSV data from {filepath}...")
    frame = pd.read_csv(filepath, encoding='utf-8', sep='\t')
    row_count = len(frame)
    print(f"Loaded {row_count} rows")
    return frame

def load_json_data(filepath):
    """
    Parse a UTF-8 JSON file and return its contents, reporting progress on stdout.

    Args:
        filepath: Location of the JSON file to read

    Returns:
        list: Decoded JSON content (the JSON files written by this script
        hold a list of records)
    """
    print(f"Loading JSON data from {filepath}...")
    with open(filepath, encoding='utf-8') as handle:
        records = json.loads(handle.read())
    print(f"Loaded {len(records)} records")
    return records

def convert_tsv_to_json(tsv_path, json_path):
    """
    Convert TSV file to JSON format.

    The TSV is read with load_tsv_data() and written as a JSON array holding
    one object per row, keyed by column name. Missing cells (NaN/NA in the
    DataFrame) are written as JSON ``null``: json.dump would otherwise emit
    the bare token ``NaN``, which is not valid JSON and is rejected by
    strict parsers.

    Args:
        tsv_path: Path to TSV file
        json_path: Path to output JSON file
    """
    print(f"Converting {tsv_path} to {json_path}...")
    df = load_tsv_data(tsv_path)

    # Swap missing values for None so they serialize as null instead of NaN.
    data = [
        {
            column: (None if pd.isna(value) else value)
            for column, value in row.items()
        }
        for row in df.to_dict('records')
    ]

    with open(json_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"Conversion complete. Saved to {json_path}")

def convert_json_to_tsv(json_path, tsv_path):
    """
    Write the records of a JSON file out as a tab-separated file.

    The JSON content is loaded with load_json_data(), turned into a
    DataFrame, and saved as UTF-8 TSV without the index column.

    Args:
        json_path: Location of the JSON file to read
        tsv_path: Location of the TSV file to create
    """
    print(f"Converting {json_path} to {tsv_path}...")
    records = load_json_data(json_path)
    pd.DataFrame(records).to_csv(
        tsv_path,
        sep='\t',
        index=False,
        encoding='utf-8',
    )
    print(f"Conversion complete. Saved to {tsv_path}")

def get_dataset_statistics(filepath):
    """
    Print summary statistics for a dataset file.

    For a ``.tsv`` file this prints the row count, column count, column
    names and the first rows (``DataFrame.head()``). For a ``.json`` file it
    prints the record count and, when the content is a non-empty list, the
    first record (preceded by its keys when that record is an object). The
    extension is matched case-insensitively; any other extension is reported
    as unsupported.

    Args:
        filepath: Path to data file (TSV or JSON), as a str or pathlib.Path

    Returns:
        None. All output goes to stdout.
    """
    print(f"\n=== Dataset Statistics: {filepath} ===")

    # Callers may pass a plain string; only Path objects expose .suffix.
    suffix = Path(filepath).suffix.lower()

    if suffix == '.tsv':
        df = load_tsv_data(filepath)
        print(f"Total rows: {len(df)}")
        print(f"Total columns: {len(df.columns)}")
        print(f"\nColumns: {', '.join(df.columns)}")
        print("\nFirst few rows:")
        print(df.head())

    elif suffix == '.json':
        data = load_json_data(filepath)
        print(f"Total records: {len(data)}")
        # Only a non-empty list has a meaningful "first record"; a top-level
        # JSON object would raise KeyError on data[0].
        if isinstance(data, list) and data:
            first = data[0]
            if isinstance(first, dict):
                print(f"Keys: {', '.join(first.keys())}")
            print("\nFirst record:")
            print(json.dumps(first, indent=2, ensure_ascii=False))

    else:
        # Say so explicitly instead of printing a header with nothing under it.
        print(f"Unsupported file type '{suffix}'; expected .tsv or .json")

def main():
    """Print statistics for the train and validation TSVs and create any missing JSON copies."""
    banner = "=" * 60
    print(banner)
    print("Language Translation Dataset - Data Processing")
    print("Author: RSK World (https://rskworld.in)")
    print(banner)

    # Create the data directory (and any missing parents) before looking in it
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    # Each split gets the same treatment, training first: report statistics
    # for the TSV, then write the JSON copy unless one is already on disk.
    splits = (
        ('train.tsv', 'train.json'),
        ('validation.tsv', 'validation.json'),
    )
    for tsv_name, json_name in splits:
        tsv_file = DATA_DIR / tsv_name
        json_file = DATA_DIR / json_name

        if not tsv_file.exists():
            continue

        get_dataset_statistics(tsv_file)
        if not json_file.exists():
            convert_tsv_to_json(tsv_file, json_file)

    print("\n" + banner)
    print("Processing complete!")
    print(banner)

# Run the processing pass only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

146 lines•3.9 KB
python

About RSK World

Founded by Molla Samser, with Designer & Tester Rima Khatun, RSK World is your one-stop destination for free programming resources, source code, and development tools.

Founder: Molla Samser
Designer & Tester: Rima Khatun

Development

  • Game Development
  • Web Development
  • Mobile Development
  • AI Development
  • Development Tools

Legal

  • Terms & Conditions
  • Privacy Policy
  • Disclaimer

Contact Info

Nutanhat, Mongolkote
Purba Burdwan, West Bengal
India, 713147

+91 93305 39277

hello@rskworld.in
support@rskworld.in

© 2026 RSK World. All rights reserved.

Content used for educational purposes only. View Disclaimer