RSK World - NLP Text Analysis Bot - Project Files Browser | RSK World

setup.py .env.example sample_knowledge.txt text_preprocessing.py example_usage.py

setup.py

"""
Setup script for NLP Text Analysis Bot
Developer: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
Year: 2026
"""

from setuptools import setup, find_packages

# The README doubles as the long description shown on the package index page.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Build the dependency list from requirements.txt.
# Each line is stripped BEFORE it is inspected so that indented "# ..." comment
# lines are recognised as comments, and anything after a whitespace-preceded
# "#" (an inline comment) is dropped. Lines starting with "-" are pip options
# (-r, -e, --index-url, ...) rather than requirement specifiers, so they are
# skipped as well: install_requires only accepts requirement strings.
with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = []
    for raw_line in fh:
        requirement = raw_line.split(" #", 1)[0].strip()
        if requirement and not requirement.startswith(("#", "-")):
            requirements.append(requirement)

# Trove classifiers describing maturity, audience, topics and supported Pythons.
_CLASSIFIERS = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
]

# Package metadata gathered in one mapping and handed to setuptools below.
_PACKAGE_METADATA = {
    "name": "nlp-text-analysis-bot",
    "version": "1.0.0",
    "author": "RSK World",
    "author_email": "help@rskworld.in",
    "description": (
        "Chatbot with natural language processing capabilities "
        "for text understanding and analysis"
    ),
    "long_description": long_description,
    "long_description_content_type": "text/markdown",
    "url": "https://rskworld.in",
    "packages": find_packages(),
    "classifiers": _CLASSIFIERS,
    "python_requires": ">=3.8",
    "install_requires": requirements,
}

setup(**_PACKAGE_METADATA)

43 lines•1.4 KB

python

text_preprocessing.py

Raw Download

"""
Text Preprocessing Module
Handles text cleaning, tokenization, and normalization

Developer: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
Year: 2026
"""

import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download required NLTK data
# Each entry pairs the resource path probed with nltk.data.find() with the
# package name handed to nltk.download() when the probe raises LookupError.
# 'punkt_tab' is listed next to 'punkt' because newer NLTK releases load the
# Punkt tokenizer data from 'punkt_tab'; with only 'punkt' installed,
# word_tokenize/sent_tokenize raise LookupError on those releases.
_REQUIRED_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
)

for _resource_path, _package_name in _REQUIRED_NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # quiet=True keeps the downloader from printing progress on import
        nltk.download(_package_name, quiet=True)

class TextPreprocessor:
    """
    Text preprocessing class for cleaning and normalizing text
    Developer: RSK World - https://rskworld.in
    """

    # Patterns are compiled once at class creation instead of on every call.
    # URLs must carry an explicit scheme ("http://", "https://") or a "www."
    # prefix, so ordinary words that merely start with "http"/"www"
    # (e.g. "httpd") are left alone.
    _URL_PATTERN = re.compile(r'https?://\S+|\bwww\.\S+')
    _EMAIL_PATTERN = re.compile(r'\S+@\S+')
    _NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self):
        """Initialize preprocessor with stopwords and lemmatizer"""
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text: str) -> str:
        """
        Clean text by removing URLs, e-mail addresses and non-letter characters

        The result is lowercase, contains only ASCII letters separated by
        single spaces, and has no leading or trailing whitespace. All digits
        and punctuation are removed, so the output carries no sentence
        boundaries.

        Args:
            text (str): Raw text input

        Returns:
            str: Cleaned text
        """
        # Convert to lowercase
        text = text.lower()

        # Remove URLs
        text = self._URL_PATTERN.sub('', text)

        # Remove email addresses
        text = self._EMAIL_PATTERN.sub('', text)

        # Remove everything that is not a letter or whitespace
        # (digits and punctuation included)
        text = self._NON_LETTER_PATTERN.sub('', text)

        # Collapse runs of whitespace and trim the ends
        return self._WHITESPACE_PATTERN.sub(' ', text).strip()

    def tokenize(self, text: str) -> dict:
        """
        Tokenize text into words and sentences

        Args:
            text (str): Input text

        Returns:
            dict: 'words' (output of word_tokenize) and 'sentences'
                (output of sent_tokenize) for the given text
        """
        return {
            'words': word_tokenize(text),
            'sentences': sent_tokenize(text),
        }

    def remove_stopwords(self, tokens: list) -> list:
        """
        Remove stopwords from token list

        The comparison is case-insensitive; surviving tokens keep their
        original casing and order.

        Args:
            tokens (list): List of word tokens

        Returns:
            list: Tokens without stopwords
        """
        return [token for token in tokens if token.lower() not in self.stop_words]

    def lemmatize(self, tokens: list) -> list:
        """
        Lemmatize tokens with the WordNet lemmatizer

        Args:
            tokens (list): List of word tokens

        Returns:
            list: Lemmatized tokens, in the same order as the input
        """
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def preprocess(self, text: str) -> dict:
        """
        Complete preprocessing pipeline

        Word-level data (tokens, counts, vocabulary richness) is derived from
        the cleaned text. Sentences are split from the original text (with
        whitespace collapsed) because cleaning strips all punctuation, which
        would otherwise make every input look like a single sentence.

        Args:
            text (str): Raw input text

        Returns:
            dict: Preprocessed text data with the keys 'original_text',
                'cleaned_text', 'tokens', 'filtered_tokens',
                'lemmatized_tokens', 'sentences', 'word_count',
                'sentence_count', 'unique_words' and 'vocabulary_richness'
                (unique words / total words, 0 when there are no words)
        """
        # Clean text
        cleaned_text = self.clean_text(text)

        # Word tokens come from the cleaned (letters-only) text
        words = self.tokenize(cleaned_text)['words']

        # Sentence boundaries need the punctuation that cleaning removes,
        # so split the whitespace-normalized original instead
        sentences = sent_tokenize(self._WHITESPACE_PATTERN.sub(' ', text).strip())

        # Remove stopwords
        filtered_tokens = self.remove_stopwords(words)

        # Lemmatize
        lemmatized_tokens = self.lemmatize(filtered_tokens)

        word_count = len(words)
        unique_words = len(set(words))

        return {
            'original_text': text,
            'cleaned_text': cleaned_text,
            'tokens': words,
            'filtered_tokens': filtered_tokens,
            'lemmatized_tokens': lemmatized_tokens,
            'sentences': sentences,
            'word_count': word_count,
            'sentence_count': len(sentences),
            'unique_words': unique_words,
            # Guard against division by zero for empty input
            'vocabulary_richness': unique_words / word_count if word_count else 0
        }

154 lines•4.2 KB

python

example_usage.py

Raw Download

"""
Example usage of NLP Text Analysis Bot
Developer: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
Year: 2026
"""

from nlp_pipeline import NLPPipeline

def main():
    """
    Run three sample texts through the NLP pipeline and print selected results

    The samples are a product review, a news article and a social media post.
    Developer: RSK World - https://rskworld.in
    """
    def show_banner(title):
        # Blank line, rule, title, rule: the framing used before each example
        rule = "="*60
        print("\n" + rule)
        print(title)
        print(rule)

    # Initialize the NLP pipeline
    print("Initializing NLP Pipeline...")
    nlp = NLPPipeline()

    # Example 1: Product review analysis
    show_banner("Example 1: Product Review Analysis")

    review = """
    I absolutely love this new smartphone! The camera quality is amazing 
    and the battery life lasts all day. The design is sleek and modern. 
    However, the price is quite high compared to competitors. Overall, 
    it's a great product from Apple Inc.
    """

    review_report = nlp.analyze(review)
    review_sentiment = review_report['sentiment']
    print(f"Sentiment: {review_sentiment['label']}")
    print(f"Score: {review_sentiment['score']:.3f}")
    entity_total = len(review_report['entities']['entities'])
    print(f"Entities: {entity_total} found")
    # Only the first three topics are shown
    leading_topics = review_report['semantic']['topics'][:3]
    print(f"Key Topics: {', '.join(leading_topics)}")

    # Example 2: News article analysis
    show_banner("Example 2: News Article Analysis")

    news = """
    Microsoft Corporation announced today that it will be opening a new 
    research facility in Seattle, Washington. The company's CEO, Satya 
    Nadella, stated that this investment will create over 500 jobs and 
    focus on artificial intelligence research. This is great news for 
    the local economy and technology sector.
    """

    news_report = nlp.analyze(news)
    print(f"Sentiment: {news_report['sentiment']['label']}")
    news_summary = news_report['summary']
    print(f"Word Count: {news_summary['word_count']}")
    print(f"Entities Found: {news_summary['entity_count']}")

    # Print all entities
    print("\nDetected Entities:")
    for found in news_report['entities']['entities']:
        print(f"  - {found['text']} ({found['label']})")

    # Example 3: Social media post
    show_banner("Example 3: Social Media Post Analysis")

    post = "Just had the worst experience at the restaurant. Terrible service and cold food. Never going back!"

    post_report = nlp.analyze(post)
    post_sentiment = post_report['sentiment']
    print(f"Sentiment: {post_sentiment['label']}")
    print(f"Sentiment Score: {post_sentiment['score']:.3f}")
    # The VADER breakdown is printed only when the pipeline supplied one
    vader = post_sentiment.get('vader')
    if vader:
        print(
            f"VADER Breakdown - Positive: {vader['positive']:.2%}, "
            f"Neutral: {vader['neutral']:.2%}, Negative: {vader['negative']:.2%}"
        )


if __name__ == '__main__':
    main()

80 lines•2.7 KB

python

Theme Settings

Color Scheme

Display Options

Font Size