RSK World - Language Translation Dataset - Project Files Browser | RSK World

.gitignore .gitkeep download_translation_data.py

.gitignore

# Language Translation Dataset - Git Ignore
# Author: RSK World
# Website: https://rskworld.in
# Email: help@rskworld.in
# Phone: +91 93305 39277
# Copyright © 2016 RSK World. All rights reserved.

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
ENV/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Data (uncomment if you don't want to track data files)
# data/*.tsv
# data/*.json

# Large downloaded files (exceed GitHub 100MB limit)
data/tatoeba_*.csv
data/opus_*.zip
data/*.zip

# Models
models/
*.pth
*.pt
*.h5
*.ckpt

# Logs
*.log
logs/

64 lines•705 B

text

scripts/download_translation_data.py

Raw Download

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Language Translation Dataset - Download Translation Data Script
Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
Copyright © 2016 RSK World. All rights reserved.

This script downloads translation data from publicly available sources and adds it to the local dataset.
"""

import json
import requests
import csv
import os
import time
from pathlib import Path
from urllib.parse import urlparse
import zipfile
import io

# Configuration
# BASE_DIR is the parent of the directory that contains this script file.
BASE_DIR = Path(__file__).parent.parent
# Downloads, train.json and sample_data.json are read from / written to DATA_DIR.
DATA_DIR = BASE_DIR / 'data'
# NOTE(review): SCRIPTS_DIR is not referenced by the code visible in this file.
SCRIPTS_DIR = BASE_DIR / 'scripts'

# Ensure data directory exists
# This runs at import time, so importing the module is enough to create it.
DATA_DIR.mkdir(parents=True, exist_ok=True)

def download_file(url, filename):
    """Download ``url`` into the data directory as ``filename``.

    The response body is streamed into a temporary ``<filename>.part`` file
    that is renamed to its final name only after the whole body has been
    written. An interrupted transfer therefore never leaves a truncated file
    under the final name.

    Args:
        url: Address of the file to fetch.
        filename: Name of the destination file inside ``DATA_DIR``.

    Returns:
        Path of the downloaded file, or ``None`` when the request or the
        write failed (the error is printed, not raised).
    """
    print(f"Downloading {filename} from {url}...")
    filepath = DATA_DIR / filename
    partial_path = filepath.with_name(filepath.name + '.part')
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        # Publish the file only once it is complete.
        partial_path.replace(filepath)
    except (requests.RequestException, OSError) as e:
        print(f"[ERROR] Error downloading {filename}: {e}")
        try:
            partial_path.unlink()
        except OSError:
            # Nothing was written yet, or the leftover cannot be removed;
            # either way the final file name was never touched.
            pass
        return None

    print(f"[OK] Downloaded {filename} successfully")
    return filepath

def download_tatoeba_data():
    """Fetch the Tatoeba sentence and link exports into the data directory.

    Returns:
        dict mapping ``'sentences'`` / ``'links'`` to the path returned by
        ``download_file``. An entry is left out when its download returned
        a falsy value (i.e. failed).
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Downloading Tatoeba Translation Data")
    print(banner)

    # (result key, export URL) in the order they are downloaded
    sources = (
        ('sentences', 'https://downloads.tatoeba.org/exports/sentences.csv'),
        ('links', 'https://downloads.tatoeba.org/exports/links.csv'),
    )

    # Lazy generator: each download runs as the comprehension consumes it.
    attempts = (
        (name, download_file(link, f"tatoeba_{name}.csv"))
        for name, link in sources
    )
    return {name: saved for name, saved in attempts if saved}

def download_opus_data():
    """Fetch the OPUS (Open Parallel Corpus) English-Spanish archives.

    Returns:
        dict mapping ``'open_subtitles'`` / ``'europarl'`` to the path
        returned by ``download_file``. An entry is left out when its
        download returned a falsy value (i.e. failed).
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Downloading OPUS Translation Data")
    print(banner)

    # (result key, archive URL) in the order they are downloaded.
    # NOTE(review): these look like full corpus archives rather than small
    # samples - confirm their size before relying on the 30 s timeout.
    sources = (
        ('open_subtitles', 'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/en-es.txt.zip'),
        ('europarl', 'https://object.pouta.csc.fi/OPUS-Europarl/v8/moses/en-es.txt.zip'),
    )

    # Lazy generator: each download runs as the comprehension consumes it.
    attempts = (
        (name, download_file(link, f"opus_{name}.zip"))
        for name, link in sources
    )
    return {name: saved for name, saved in attempts if saved}

def process_tatoeba_data(tatoeba_files):
    """Build translation pairs from the Tatoeba sentence and link exports.

    Only English, Spanish, French and German sentences are kept. Each linked
    pair of sentences in two different languages becomes one record.

    Args:
        tatoeba_files: Mapping with ``'sentences'`` and ``'links'`` entries
            pointing at the tab-separated export files. Nothing is processed
            when either entry is missing.

    Returns:
        List of dicts ``{'id': n, <lang>: text, <other lang>: text}`` keyed
        by two-letter language codes (``en``/``es``/``fr``/``de``). Read or
        parse errors are printed and the pairs collected so far are returned.
    """
    print("\n" + "="*60)
    print("Processing Tatoeba Data")
    print("="*60)

    translations = []

    if 'sentences' not in tatoeba_files or 'links' not in tatoeba_files:
        return translations

    # Tatoeba three-letter codes -> codes used in the output records.
    lang_map = {'eng': 'en', 'spa': 'es', 'fra': 'fr', 'deu': 'de'}

    try:
        # Read sentences. The export is plain tab-separated text without CSV
        # quoting, so quote handling must be disabled: otherwise a sentence
        # starting with '"' loses its quotes or swallows the rows after it.
        sentences = {}
        with open(tatoeba_files['sentences'], 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                if len(row) >= 3 and row[1] in lang_map:
                    sentences[row[0]] = {'lang': lang_map[row[1]], 'text': row[2]}

        # Read links to find translation pairs. A link may be listed in both
        # directions (a->b and b->a); remember unordered pairs so each
        # translation is emitted once.
        seen_pairs = set()
        with open(tatoeba_files['links'], 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                if len(row) < 2:
                    continue
                id1, id2 = row[0], row[1]
                s1 = sentences.get(id1)
                s2 = sentences.get(id2)
                if s1 is None or s2 is None or s1['lang'] == s2['lang']:
                    continue

                pair_key = (id1, id2) if id1 <= id2 else (id2, id1)
                if pair_key in seen_pairs:
                    continue
                seen_pairs.add(pair_key)

                translations.append({
                    'id': len(translations) + 1,
                    s1['lang']: s1['text'],
                    s2['lang']: s2['text'],
                })

        print(f"[OK] Processed {len(translations)} translation pairs from Tatoeba")
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"[ERROR] Error processing Tatoeba data: {e}")

    return translations

def _iter_opus_line_pairs(zip_ref, src_name, tgt_name, max_pairs):
    """Yield stripped (source, target) lines from two aligned archive members.

    Only the first ``max_pairs`` line pairs are read; pairs where either side
    is blank are skipped. Both members are streamed, never loaded whole.
    """
    with zip_ref.open(src_name) as src_raw, zip_ref.open(tgt_name) as tgt_raw:
        # newline='\n' keeps the two files aligned: a stray '\r' inside a
        # sentence must not be treated as an extra line break.
        src_lines = io.TextIOWrapper(src_raw, encoding='utf-8', newline='\n')
        tgt_lines = io.TextIOWrapper(tgt_raw, encoding='utf-8', newline='\n')
        for line_no, (src_line, tgt_line) in enumerate(zip(src_lines, tgt_lines)):
            if line_no >= max_pairs:
                break
            src_text = src_line.strip()
            tgt_text = tgt_line.strip()
            if src_text and tgt_text:
                yield src_text, tgt_text


def process_opus_data(opus_files, src_lang='en', tgt_lang='es', max_pairs=1000):
    """Extract aligned sentence pairs from downloaded OPUS archives.

    A Moses-format OPUS archive is expected to hold two line-aligned text
    members whose names differ only in the trailing language code, e.g.
    ``Corpus.en-es.en`` and ``Corpus.en-es.es``. Line N of one file is the
    translation of line N of the other.

    Args:
        opus_files: Mapping of corpus name to archive path
            (``pathlib.Path``). Paths whose suffix is not ``.zip`` are
            ignored.
        src_lang: Language code of the source member; also the output key.
        tgt_lang: Language code of the target member; also the output key.
        max_pairs: Maximum number of lines read from each member pair.

    Returns:
        List of dicts ``{'id': n, src_lang: text, tgt_lang: text}``. An
        archive that cannot be read is reported and skipped; pairs collected
        before the error are kept.
    """
    print("\n" + "="*60)
    print("Processing OPUS Data")
    print("="*60)

    translations = []
    src_suffix = f'.{src_lang}'
    tgt_suffix = f'.{tgt_lang}'

    for key, filepath in opus_files.items():
        try:
            if filepath.suffix == '.zip':
                with zipfile.ZipFile(filepath, 'r') as zip_ref:
                    members = set(zip_ref.namelist())
                    found_pair = False
                    for src_name in sorted(members):
                        if not src_name.endswith(src_suffix):
                            continue
                        tgt_name = src_name[:-len(src_suffix)] + tgt_suffix
                        if tgt_name not in members:
                            continue
                        found_pair = True
                        for src_text, tgt_text in _iter_opus_line_pairs(
                                zip_ref, src_name, tgt_name, max_pairs):
                            translations.append({
                                'id': len(translations) + 1,
                                src_lang: src_text,
                                tgt_lang: tgt_text,
                            })
                    if not found_pair:
                        print(f"[WARN] No {src_lang}/{tgt_lang} parallel files found in {key}")

            print(f"[OK] Processed data from {key}")
        except Exception as e:
            # Deliberately broad: a corrupt or truncated archive can raise
            # several unrelated exception types, and one bad archive must not
            # stop the remaining ones from being processed.
            print(f"[ERROR] Error processing OPUS data {key}: {e}")

    return translations

def download_common_words():
    """Download and create common words dictionary."""
    print("\n" + "="*60)
    print("Creating Common Words Dictionary")
    print("="*60)
    
    # Common words and phrases with translations
    common_words = [
        # Greetings
        {"en": "Hello", "es": "Hola", "fr": "Bonjour", "de": "Hallo"},
        {"en": "Good morning", "es": "Buenos días", "fr": "Bonjour", "de": "Guten Morgen"},
        {"en": "Good afternoon", "es": "Buenas tardes", "fr": "Bon après-midi", "de": "Guten Tag"},
        {"en": "Good evening", "es": "Buenas noches", "fr": "Bonsoir", "de": "Guten Abend"},
        {"en": "Good night", "es": "Buenas noches", "fr": "Bonne nuit", "de": "Gute Nacht"},
        {"en": "How are you?", "es": "¿Cómo estás?", "fr": "Comment allez-vous?", "de": "Wie geht es dir?"},
        {"en": "Thank you", "es": "Gracias", "fr": "Merci", "de": "Danke"},
        {"en": "Please", "es": "Por favor", "fr": "S'il vous plaît", "de": "Bitte"},
        {"en": "You're welcome", "es": "De nada", "fr": "De rien", "de": "Bitte schön"},
        {"en": "Excuse me", "es": "Disculpe", "fr": "Excusez-moi", "de": "Entschuldigung"},
        
        # Common phrases
        {"en": "Yes", "es": "Sí", "fr": "Oui", "de": "Ja"},
        {"en": "No", "es": "No", "fr": "Non", "de": "Nein"},
        {"en": "Maybe", "es": "Tal vez", "fr": "Peut-être", "de": "Vielleicht"},
        {"en": "I don't know", "es": "No sé", "fr": "Je ne sais pas", "de": "Ich weiß nicht"},
        {"en": "I understand", "es": "Entiendo", "fr": "Je comprends", "de": "Ich verstehe"},
        {"en": "I don't understand", "es": "No entiendo", "fr": "Je ne comprends pas", "de": "Ich verstehe nicht"},
        {"en": "Can you help me?", "es": "¿Puedes ayudarme?", "fr": "Pouvez-vous m'aider?", "de": "Können Sie mir helfen?"},
        {"en": "Where is...?", "es": "¿Dónde está...?", "fr": "Où est...?", "de": "Wo ist...?"},
        {"en": "How much?", "es": "¿Cuánto?", "fr": "Combien?", "de": "Wie viel?"},
        {"en": "What time is it?", "es": "¿Qué hora es?", "fr": "Quelle heure est-il?", "de": "Wie spät ist es?"},
        
        # Numbers
        {"en": "One", "es": "Uno", "fr": "Un", "de": "Eins"},
        {"en": "Two", "es": "Dos", "fr": "Deux", "de": "Zwei"},
        {"en": "Three", "es": "Tres", "fr": "Trois", "de": "Drei"},
        {"en": "Four", "es": "Cuatro", "fr": "Quatre", "de": "Vier"},
        {"en": "Five", "es": "Cinco", "fr": "Cinq", "de": "Fünf"},
        {"en": "Ten", "es": "Diez", "fr": "Dix", "de": "Zehn"},
        {"en": "Twenty", "es": "Veinte", "fr": "Vingt", "de": "Zwanzig"},
        {"en": "One hundred", "es": "Cien", "fr": "Cent", "de": "Hundert"},
        
        # Days of week
        {"en": "Monday", "es": "Lunes", "fr": "Lundi", "de": "Montag"},
        {"en": "Tuesday", "es": "Martes", "fr": "Mardi", "de": "Dienstag"},
        {"en": "Wednesday", "es": "Miércoles", "fr": "Mercredi", "de": "Mittwoch"},
        {"en": "Thursday", "es": "Jueves", "fr": "Jeudi", "de": "Donnerstag"},
        {"en": "Friday", "es": "Viernes", "fr": "Vendredi", "de": "Freitag"},
        {"en": "Saturday", "es": "Sábado", "fr": "Samedi", "de": "Samstag"},
        {"en": "Sunday", "es": "Domingo", "fr": "Dimanche", "de": "Sonntag"},
        
        # Months
        {"en": "January", "es": "Enero", "fr": "Janvier", "de": "Januar"},
        {"en": "February", "es": "Febrero", "fr": "Février", "de": "Februar"},
        {"en": "March", "es": "Marzo", "fr": "Mars", "de": "März"},
        {"en": "April", "es": "Abril", "fr": "Avril", "de": "April"},
        {"en": "May", "es": "Mayo", "fr": "Mai", "de": "Mai"},
        {"en": "June", "es": "Junio", "fr": "Juin", "de": "Juni"},
        
        # Common verbs
        {"en": "To be", "es": "Ser/Estar", "fr": "Être", "de": "Sein"},
        {"en": "To have", "es": "Tener", "fr": "Avoir", "de": "Haben"},
        {"en": "To go", "es": "Ir", "fr": "Aller", "de": "Gehen"},
        {"en": "To come", "es": "Venir", "fr": "Venir", "de": "Kommen"},
        {"en": "To do", "es": "Hacer", "fr": "Faire", "de": "Tun"},
        {"en": "To say", "es": "Decir", "fr": "Dire", "de": "Sagen"},
        {"en": "To see", "es": "Ver", "fr": "Voir", "de": "Sehen"},
        {"en": "To know", "es": "Saber", "fr": "Savoir", "de": "Wissen"},
        {"en": "To want", "es": "Querer", "fr": "Vouloir", "de": "Wollen"},
        {"en": "To need", "es": "Necesitar", "fr": "Avoir besoin", "de": "Brauchen"},
        
        # Food
        {"en": "Water", "es": "Agua", "fr": "Eau", "de": "Wasser"},
        {"en": "Food", "es": "Comida", "fr": "Nourriture", "de": "Essen"},
        {"en": "Bread", "es": "Pan", "fr": "Pain", "de": "Brot"},
        {"en": "Milk", "es": "Leche", "fr": "Lait", "de": "Milch"},
        {"en": "Coffee", "es": "Café", "fr": "Café", "de": "Kaffee"},
        {"en": "Tea", "es": "Té", "fr": "Thé", "de": "Tee"},
        {"en": "Meat", "es": "Carne", "fr": "Viande", "de": "Fleisch"},
        {"en": "Fish", "es": "Pescado", "fr": "Poisson", "de": "Fisch"},
        {"en": "Fruit", "es": "Fruta", "fr": "Fruit", "de": "Obst"},
        {"en": "Vegetable", "es": "Verdura", "fr": "Légume", "de": "Gemüse"},
        
        # Family
        {"en": "Family", "es": "Familia", "fr": "Famille", "de": "Familie"},
        {"en": "Father", "es": "Padre", "fr": "Père", "de": "Vater"},
        {"en": "Mother", "es": "Madre", "fr": "Mère", "de": "Mutter"},
        {"en": "Brother", "es": "Hermano", "fr": "Frère", "de": "Bruder"},
        {"en": "Sister", "es": "Hermana", "fr": "Sœur", "de": "Schwester"},
        {"en": "Son", "es": "Hijo", "fr": "Fils", "de": "Sohn"},
        {"en": "Daughter", "es": "Hija", "fr": "Fille", "de": "Tochter"},
        
        # Colors
        {"en": "Red", "es": "Rojo", "fr": "Rouge", "de": "Rot"},
        {"en": "Blue", "es": "Azul", "fr": "Bleu", "de": "Blau"},
        {"en": "Green", "es": "Verde", "fr": "Vert", "de": "Grün"},
        {"en": "Yellow", "es": "Amarillo", "fr": "Jaune", "de": "Gelb"},
        {"en": "Black", "es": "Negro", "fr": "Noir", "de": "Schwarz"},
        {"en": "White", "es": "Blanco", "fr": "Blanc", "de": "Weiß"},
        
        # Time
        {"en": "Today", "es": "Hoy", "fr": "Aujourd'hui", "de": "Heute"},
        {"en": "Tomorrow", "es": "Mañana", "fr": "Demain", "de": "Morgen"},
        {"en": "Yesterday", "es": "Ayer", "fr": "Hier", "de": "Gestern"},
        {"en": "Now", "es": "Ahora", "fr": "Maintenant", "de": "Jetzt"},
        {"en": "Later", "es": "Más tarde", "fr": "Plus tard", "de": "Später"},
        
        # Places
        {"en": "Home", "es": "Casa", "fr": "Maison", "de": "Zuhause"},
        {"en": "School", "es": "Escuela", "fr": "École", "de": "Schule"},
        {"en": "Hospital", "es": "Hospital", "fr": "Hôpital", "de": "Krankenhaus"},
        {"en": "Restaurant", "es": "Restaurante", "fr": "Restaurant", "de": "Restaurant"},
        {"en": "Hotel", "es": "Hotel", "fr": "Hôtel", "de": "Hotel"},
        {"en": "Airport", "es": "Aeropuerto", "fr": "Aéroport", "de": "Flughafen"},
        {"en": "Station", "es": "Estación", "fr": "Gare", "de": "Bahnhof"},
        
        # Actions
        {"en": "To eat", "es": "Comer", "fr": "Manger", "de": "Essen"},
        {"en": "To drink", "es": "Beber", "fr": "Boire", "de": "Trinken"},
        {"en": "To sleep", "es": "Dormir", "fr": "Dormir", "de": "Schlafen"},
        {"en": "To work", "es": "Trabajar", "fr": "Travailler", "de": "Arbeiten"},
        {"en": "To study", "es": "Estudiar", "fr": "Étudier", "de": "Studieren"},
        {"en": "To read", "es": "Leer", "fr": "Lire", "de": "Lesen"},
        {"en": "To write", "es": "Escribir", "fr": "Écrire", "de": "Schreiben"},
        {"en": "To speak", "es": "Hablar", "fr": "Parler", "de": "Sprechen"},
        {"en": "To listen", "es": "Escuchar", "fr": "Écouter", "de": "Zuhören"},
        {"en": "To learn", "es": "Aprender", "fr": "Apprendre", "de": "Lernen"},
    ]
    
    # Convert to format with IDs
    translations = []
    for i, word in enumerate(common_words, 1):
        trans = {
            'id': i,
            'english': word.get('en', ''),
            'spanish': word.get('es', ''),
            'french': word.get('fr', ''),
            'german': word.get('de', '')
        }
        translations.append(trans)
    
    print(f"[OK] Created {len(translations)} common words translations")
    return translations

def merge_translations(existing_file, new_translations):
    """Merge new translations into the dataset stored in ``existing_file``.

    Entries whose content (every field except ``id``) is already present in
    the dataset, or appeared earlier in ``new_translations``, are skipped, so
    running the script repeatedly does not grow the file with duplicates.
    Accepted entries are copied and given fresh sequential ids continuing
    after the largest existing id; the caller's dicts are left untouched.

    Args:
        existing_file: ``pathlib.Path`` of the JSON dataset. It is created
            if it does not exist yet.
        new_translations: Iterable of translation dicts to add.

    Returns:
        The full merged list, exactly as written to ``existing_file``.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON. The
            file is left untouched in that case.
    """
    print("\n" + "="*60)
    print("Merging Translations")
    print("="*60)

    # Load existing data
    existing_translations = []
    if existing_file.exists():
        with open(existing_file, 'r', encoding='utf-8') as f:
            existing_translations = json.load(f)
        print(f"[OK] Loaded {len(existing_translations)} existing translations")

    def content_key(entry):
        # Canonical, hashable form of an entry that ignores its id.
        payload = {k: v for k, v in entry.items() if k != 'id'}
        return json.dumps(payload, sort_keys=True, ensure_ascii=False)

    seen = {content_key(t) for t in existing_translations}
    max_id = max((t.get('id', 0) for t in existing_translations), default=0)

    # Add new translations with unique IDs
    added = 0
    skipped = 0
    for trans in new_translations:
        key = content_key(trans)
        if key in seen:
            skipped += 1
            continue
        seen.add(key)
        max_id += 1
        # Copy instead of mutating the caller's dict.
        existing_translations.append({**trans, 'id': max_id})
        added += 1

    # Save merged data via a temporary file so an interrupted write cannot
    # truncate the existing dataset.
    temp_file = existing_file.with_name(existing_file.name + '.tmp')
    with open(temp_file, 'w', encoding='utf-8') as f:
        json.dump(existing_translations, f, ensure_ascii=False, indent=2)
    temp_file.replace(existing_file)

    print(f"[OK] Merged {added} new translations")
    if skipped:
        print(f"[OK] Skipped {skipped} duplicate translations")
    print(f"[OK] Total translations: {len(existing_translations)}")

    return existing_translations

def main():
    """Run the whole pipeline: collect translations, merge them, write a sample.

    The built-in common words are always included. Tatoeba and OPUS data are
    added on a best-effort basis: any exception from either source is
    reported and the run continues. The combined list is merged into
    ``data/train.json`` and the first 15 merged entries are written to
    ``data/sample_data.json``.
    """
    rule = "=" * 60
    print(rule)
    print("Language Translation Dataset - Download Script")
    print("Author: RSK World (https://rskworld.in)")
    print(rule)

    def fetch_and_process(fetch, process):
        # Process only when the download step returned at least one file.
        files = fetch()
        return process(files) if files else []

    # Common words need no network access, so they are always available.
    collected = list(download_common_words())

    # Tatoeba is optional and may fail.
    try:
        collected.extend(fetch_and_process(download_tatoeba_data, process_tatoeba_data))
    except Exception as e:
        print(f"Note: Could not download Tatoeba data: {e}")

    # OPUS is optional and may fail.
    try:
        collected.extend(fetch_and_process(download_opus_data, process_opus_data))
    except Exception as e:
        print(f"Note: Could not download OPUS data: {e}")

    # Merge everything into the existing train.json.
    merged_data = merge_translations(DATA_DIR / 'train.json', collected)

    # Refresh sample_data.json with the first 15 merged entries.
    sample_text = json.dumps(merged_data[:15], ensure_ascii=False, indent=2)
    (DATA_DIR / 'sample_data.json').write_text(sample_text, encoding='utf-8')

    print("\n" + rule)
    print("Download and processing complete!")
    print(f"Total translations available: {len(merged_data)}")
    print(rule)


if __name__ == '__main__':
    main()

398 lines•17.6 KB

python

Theme Settings

Color Scheme

Display Options

Font Size