These recipes demonstrate practical applications of Wikilangs models. Each recipe includes
context on why the approach works, when to use it, and complete code you can
adapt for your projects.
The problem: Given a piece of text, determine what language it's written in.
This is fundamental for multilingual pipelines—you need to know the language before you can
apply language-specific processing.
The insight: A language model trained on French will assign higher probability
to French text than to German text. Score the same text against several language models,
and the one with the highest score reveals the language.
When to use: User-generated content, web scraping, document classification,
multilingual chatbots, or any time you receive text of unknown origin.
from wikilangs import ngram
def detect_language(text, candidates=['en', 'fr', 'de', 'es', 'ar', 'zh', 'ja', 'ru']):
"""
Detect the language of input text by scoring against multiple n-gram models.
Returns the most likely language and all scores for analysis.
Higher scores (less negative log probabilities) indicate better fit.
"""
scores = {}
for lang in candidates:
try:
ng = ngram(date='latest', lang=lang, gram_size=3)
scores[lang] = ng.score(text)
except Exception:
# Language model not available
continue
if not scores:
return None, {}
# The language with highest score (least negative) wins
detected = max(scores, key=scores.get)
return detected, scores
# Try it out
examples = [
"Bonjour, comment allez-vous aujourd'hui?",
"The weather is beautiful this morning.",
"Das ist ein interessantes Buch.",
"Hola, ¿cómo estás?",
]
for text in examples:
lang, scores = detect_language(text)
print(f"'{text[:40]}...' -> {lang}")
Tips & Variations
- Use 3-grams for a good balance of speed and accuracy
- For short texts (under 20 characters), consider using character-level models
- Normalize scores by text length for fairer comparison across different-length inputs
- Add confidence thresholds: if the top two scores are close, report "uncertain" (see the sketch below)
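A minimal sketch of the last two tips, reusing the same ngram.score call as the recipe above; dividing by word count and the 0.5 margin are illustrative choices, not library defaults.
from wikilangs import ngram
def detect_language_with_confidence(text, candidates=('en', 'fr', 'de', 'es'), margin=0.5):
    """Length-normalized detection that reports 'uncertain' on close calls."""
    word_count = max(len(text.split()), 1)
    scores = {}
    for lang in candidates:
        try:
            ng = ngram(date='latest', lang=lang, gram_size=3)
            # Divide the log-probability score by word count so inputs of
            # different lengths stay comparable
            scores[lang] = ng.score(text) / word_count
        except Exception:
            continue
    if not scores:
        return None, scores
    ranked = sorted(scores, key=scores.get, reverse=True)
    # If the best and runner-up scores are within `margin`, decline to decide
    if len(ranked) > 1 and scores[ranked[0]] - scores[ranked[1]] < margin:
        return 'uncertain', scores
    return ranked[0], scores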
The problem: Find the correct spelling for a misspelled word. This is
essential for search engines, form validation, and text preprocessing.
The insight: Most misspellings are close to the correct word (a few
character edits away). By combining edit distance with word frequency, we can suggest
corrections that are both similar to the input AND actually common words in the language.
When to use: Search query correction, OCR post-processing, user input
validation, educational tools for language learners.
from wikilangs import vocabulary
def edit_distance(s1, s2):
"""Compute Levenshtein edit distance between two strings."""
if len(s1) < len(s2):
s1, s2 = s2, s1
if len(s2) == 0:
return len(s1)
prev = range(len(s2) + 1)
for i, c1 in enumerate(s1):
curr = [i + 1]
for j, c2 in enumerate(s2):
# Cost is 0 if characters match, 1 otherwise
cost = 0 if c1 == c2 else 1
curr.append(min(
prev[j + 1] + 1, # deletion
curr[j] + 1, # insertion
prev[j] + cost # substitution
))
prev = curr
return prev[-1]
def suggest_corrections(word, lang='en', max_distance=2, top_k=5):
"""
Suggest spelling corrections ranked by frequency.
We search words sharing the same prefix (most typos preserve
the first few characters) and filter by edit distance.
"""
vocab = vocabulary(date='latest', lang=lang)
# Search with progressively shorter prefixes
candidates = []
for prefix_len in range(min(3, len(word)), 0, -1):
prefix = word[:prefix_len]
candidates = vocab.get_words_with_prefix(prefix, top_k=200)
if len(candidates) >= 10:
break
# Score candidates by edit distance and frequency
suggestions = []
for candidate in candidates:
dist = edit_distance(word.lower(), candidate.lower())
if 0 < dist <= max_distance: # Exclude exact matches
info = vocab.lookup(candidate)
if info:
suggestions.append({
'word': candidate,
'distance': dist,
'frequency': info.get('frequency', 0)
})
# Sort by distance first, then by frequency (descending)
suggestions.sort(key=lambda x: (x['distance'], -x['frequency']))
return suggestions[:top_k]
# Example usage
misspelled = ["langauge", "recieve", "occurence", "accomodate", "seperate"]
for word in misspelled:
corrections = suggest_corrections(word)
if corrections:
best = corrections[0]['word']
print(f"'{word}' -> '{best}' (and {len(corrections)-1} more options)")
Tips & Variations
- For real-time applications, precompute a prefix tree (trie) of the vocabulary
- Consider phonetic similarity (Soundex, Metaphone) for homophones
- Use n-gram context to pick the right correction ("their" vs "there"); a sketch follows below
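A hedged sketch of context-aware correction that reuses suggest_corrections from this recipe and the trigram score method from the language-detection recipe; rescoring the whole sentence for each candidate is one simple approach, not the only one.
from wikilangs import ngram
def correct_in_context(sentence, misspelled_word, lang='en'):
    """Pick the suggestion whose substitution scores best under the trigram model."""
    ng = ngram(date='latest', lang=lang, gram_size=3)
    candidates = suggest_corrections(misspelled_word, lang=lang)  # defined above
    if not candidates:
        return misspelled_word
    scored = []
    for cand in candidates:
        # Substitute the candidate into the sentence (all occurrences) and rescore
        variant = sentence.replace(misspelled_word, cand['word'])
        scored.append((cand['word'], ng.score(variant)))
    # Higher score (less negative log probability) wins
    return max(scored, key=lambda s: s[1])[0]
print(correct_in_context("I will recieve the package tomorrow", "recieve"))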
The problem: Automatically categorize text into predefined topics.
This powers spam filters, content moderation, news categorization, and customer support routing.
The insight: Different topics use different vocabularies. A sports
article will contain "game", "team", "score"; a tech article will have "algorithm",
"software", "data". By weighting keywords with IDF (inverse document frequency),
we emphasize distinctive words over common ones.
When to use: When you need a fast, interpretable classifier without
training data. Good for bootstrapping before you have labeled examples.
from wikilangs import vocabulary, tokenizer
def classify_text(text, categories, lang='en'):
"""
Classify text into categories using keyword matching with IDF weighting.
Args:
text: The text to classify
categories: Dict mapping category names to lists of keywords
e.g., {'sports': ['game', 'team', 'player']}
Returns category name and scores for all categories.
"""
vocab = vocabulary(date='latest', lang=lang)
tok = tokenizer(date='latest', lang=lang, vocab_size=32000)
# Tokenize and normalize
tokens = tok.tokenize(text.lower())
words = set(t.strip('_').lower() for t in tokens if len(t.strip('_')) > 2)
scores = {}
matched_keywords = {}
for category, keywords in categories.items():
score = 0
matches = []
for keyword in keywords:
if keyword.lower() in words:
info = vocab.lookup(keyword)
# IDF weighting: rare words count more
weight = info.get('idf_score', 1.0) if info else 1.0
score += weight
matches.append(keyword)
scores[category] = score
matched_keywords[category] = matches
# Handle case where no keywords match
if all(s == 0 for s in scores.values()):
return 'unknown', scores, matched_keywords
best = max(scores, key=scores.get)
return best, scores, matched_keywords
# Define your categories with characteristic keywords
categories = {
'technology': ['computer', 'software', 'algorithm', 'data', 'programming',
'code', 'digital', 'internet', 'machine', 'artificial'],
'sports': ['game', 'team', 'player', 'score', 'match', 'championship',
'win', 'league', 'coach', 'season'],
'science': ['research', 'study', 'experiment', 'theory', 'discovery',
'scientist', 'laboratory', 'hypothesis', 'analysis', 'evidence'],
'politics': ['government', 'election', 'policy', 'vote', 'president',
'congress', 'law', 'political', 'democracy', 'campaign']
}
# Test it
articles = [
"The research team published a groundbreaking study on machine learning algorithms.",
"The championship game ended with a dramatic last-minute score by the home team.",
"Congress passed new legislation affecting digital privacy and data protection.",
]
for article in articles:
category, scores, matches = classify_text(article, categories)
print(f"Category: {category}")
print(f" Matched: {matches[category]}")
print()
Tips & Variations
- Expand keyword lists using embeddings to find similar words automatically (see the sketch after this list)
- Add negative keywords that should reduce a category's score
- For production, train a proper classifier (logistic regression, SVM) on labeled data
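One way to expand keyword lists, sketched with the embed_word and get_words_with_prefix calls used elsewhere in these recipes; the 5,000-word search pool and neighbor count are arbitrary, and for real use you would precompute the pool vectors rather than embedding them per keyword.
from wikilangs import embeddings, vocabulary
import numpy as np
def expand_keywords(seed_keywords, lang='en', per_keyword=5, pool_size=5000):
    """Suggest extra keywords by finding embedding neighbors of each seed word."""
    emb = embeddings(date='latest', lang=lang, dimension=64)
    vocab = vocabulary(date='latest', lang=lang)
    pool = vocab.get_words_with_prefix('', top_k=pool_size)  # frequent words
    expanded = set(seed_keywords)
    for keyword in seed_keywords:
        try:
            kv = emb.embed_word(keyword)
        except KeyError:
            continue
        sims = []
        for w in pool:
            try:
                wv = emb.embed_word(w)
            except Exception:
                continue
            norm = np.linalg.norm(kv) * np.linalg.norm(wv)
            if norm > 0:
                sims.append((w, float(np.dot(kv, wv) / norm)))
        sims.sort(key=lambda x: x[1], reverse=True)
        expanded.update(w for w, _ in sims[:per_keyword] if w.lower() != keyword.lower())
    return sorted(expanded)
print(expand_keywords(['algorithm', 'software'], per_keyword=3))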
The problem: Measure how semantically similar two pieces of text are.
This powers duplicate detection, plagiarism checking, and semantic search.
The insight: Word embeddings represent words as vectors in a space
where similar words are close together. By combining word vectors into sentence vectors,
we can compare entire sentences using cosine similarity.
When to use: Finding similar documents, FAQ matching, detecting
paraphrases, clustering related content, recommendation systems.
from wikilangs import embeddings
import numpy as np
def cosine_similarity(v1, v2):
"""
Compute cosine similarity between two vectors.
Returns value between -1 (opposite) and 1 (identical).
"""
dot = np.dot(v1, v2)
norm = np.linalg.norm(v1) * np.linalg.norm(v2)
return dot / norm if norm > 0 else 0
def text_similarity(text1, text2, lang='en', method='rope'):
"""
Compute semantic similarity between two texts.
Methods:
- 'average': Simple mean of word vectors
- 'rope': RoPE positional encoding (preserves word order)
- 'decay': Earlier words weighted more heavily
"""
emb = embeddings(date='latest', lang=lang, dimension=64)
vec1 = emb.embed_sentence(text1, method=method)
vec2 = emb.embed_sentence(text2, method=method)
return cosine_similarity(vec1, vec2)
def find_most_similar(query, candidates, lang='en'):
"""Find the most similar text to a query from a list of candidates."""
similarities = []
for i, candidate in enumerate(candidates):
sim = text_similarity(query, candidate, lang)
similarities.append((i, candidate, sim))
similarities.sort(key=lambda x: x[2], reverse=True)
return similarities
# Example: FAQ matching
query = "How do I reset my password?"
faq_questions = [
"What are your business hours?",
"How can I change my account password?",
"Where is your office located?",
"How do I update my email address?",
"What payment methods do you accept?",
]
print(f"Query: '{query}'\n")
results = find_most_similar(query, faq_questions)
print("Ranked by similarity:")
for i, (idx, question, sim) in enumerate(results[:3]):
print(f" {i+1}. {question}")
print(f" Similarity: {sim:.3f}")
Tips & Variations
- Use 'rope' for order-sensitive comparisons (questions, instructions)
- Use 'average' when word order doesn't matter (keywords, topics)
- For large datasets, use approximate nearest neighbor search (FAISS, Annoy); a precomputed-vector sketch follows below
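Before reaching for FAISS or Annoy, precomputing a vector matrix already avoids re-embedding candidates on every query; this sketch assumes embed_sentence returns fixed-length NumPy-compatible vectors, as in the recipe above.
from wikilangs import embeddings
import numpy as np
def build_similarity_index(candidates, lang='en', method='rope'):
    """Embed candidates once so each query costs a single matrix product."""
    emb = embeddings(date='latest', lang=lang, dimension=64)
    matrix = np.stack([emb.embed_sentence(c, method=method) for c in candidates])
    # Pre-normalize rows so a dot product equals cosine similarity
    norms = np.clip(np.linalg.norm(matrix, axis=1, keepdims=True), 1e-9, None)
    return emb, matrix / norms, list(candidates)
def query_similarity_index(query, index, method='rope', top_k=3):
    emb, matrix, candidates = index
    qv = emb.embed_sentence(query, method=method)
    qv = qv / max(np.linalg.norm(qv), 1e-9)
    sims = matrix @ qv
    order = np.argsort(-sims)[:top_k]
    return [(candidates[i], float(sims[i])) for i in order]
index = build_similarity_index(faq_questions)
print(query_similarity_index("How do I reset my password?", index))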
The problem: Suggest completions as users type, whether finishing
the current word or predicting the next one. This improves UX and typing speed.
The insight: Prefix search finds words that start with what the user
has typed. N-gram models predict what word typically comes next given the previous words.
Combining both gives a powerful autocomplete system.
When to use: Search boxes, chat interfaces, code editors,
mobile keyboards, form fields.
from wikilangs import ngram, vocabulary
def autocomplete(prefix, lang='en', word_limit=5, next_word_limit=5):
"""
Provide autocomplete suggestions for a text prefix.
Returns:
- word_completions: Ways to finish the current word
- next_words: Predictions for the next word (if current word is complete)
"""
vocab = vocabulary(date='latest', lang=lang)
ng = ngram(date='latest', lang=lang, gram_size=3)
words = prefix.strip().split()
results = {
'word_completions': [],
'next_words': [],
'input_complete': prefix.endswith(' ')
}
# Word completion: user is mid-word
if words and not prefix.endswith(' '):
partial = words[-1]
completions = vocab.get_words_with_prefix(partial, top_k=word_limit * 2)
# Filter out the partial word itself
results['word_completions'] = [w for w in completions
if w.lower() != partial.lower()][:word_limit]
    # Next word prediction: only when the current word is complete (trailing space)
    if words and results['input_complete']:
# Use last 2 words as context for trigram
context = ' '.join(words[-2:]) if len(words) >= 2 else words[-1]
predictions = ng.predict_next(context, top_k=next_word_limit)
results['next_words'] = [word for word, prob in predictions]
return results
# Simulate typing
test_inputs = [
"The quick bro", # Mid-word: should complete "brown", etc.
"The quick brown ", # Complete: should predict "fox", etc.
"machine learn", # Mid-word: "learning", "learner"
"I want to ", # Complete: predict next word
]
for prefix in test_inputs:
result = autocomplete(prefix)
print(f"Input: '{prefix}'")
if result['word_completions']:
print(f" Complete word: {result['word_completions'][:3]}")
if result['next_words']:
print(f" Next word: {result['next_words'][:3]}")
print()
Tips & Variations
- Cache vocabulary lookups for sub-100ms response times (see the caching sketch after this list)
- Personalize with user history (boost previously typed words)
- For code editors, use subword tokenizers to complete partial identifiers
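A small caching sketch using Python's functools.lru_cache; the module-level vocabulary load and the cache size are illustrative choices.
from functools import lru_cache
from wikilangs import vocabulary
# Load the model once at startup instead of on every keystroke
_vocab = vocabulary(date='latest', lang='en')
@lru_cache(maxsize=10000)
def cached_prefix_lookup(prefix, top_k=10):
    """Memoize prefix queries; repeated keystrokes on the same prefix hit the cache."""
    # Return a tuple so the result is hashable and safe to cache
    return tuple(_vocab.get_words_with_prefix(prefix, top_k=top_k))
print(cached_prefix_lookup('bro')[:3])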
The problem: Generate text continuations or writing prompts to help
overcome writer's block or explore different directions.
The insight: Markov chains are probabilistic—they don't always pick the
most likely next word, which creates variety. By seeding with a phrase, we constrain
the generation to stay on topic while exploring unexpected paths.
When to use: Creative writing tools, brainstorming aids, educational
exercises, generating placeholder text, exploring story directions.
from wikilangs import markov
import random
def generate_continuations(opening, lang='en', num_variations=3, length=50):
"""
Generate multiple possible continuations for an opening sentence.
Useful for exploring different story directions.
"""
mc = markov(date='latest', lang=lang, depth=3)
# Extract seed from opening (last few words)
words = opening.strip().split()
seed = words[-3:] if len(words) >= 3 else words
continuations = []
for i in range(num_variations):
generated = mc.generate(length=length, seed=seed)
# Combine opening with generated text
full_text = opening.rstrip() + ' ' + generated
continuations.append(full_text)
return continuations
def generate_writing_prompts(themes, lang='en', num_prompts=2):
"""
Generate writing prompts from theme words.
Good for creative exercises and overcoming writer's block.
"""
mc = markov(date='latest', lang=lang, depth=2)
prompts = []
for theme in themes:
for _ in range(num_prompts):
# Generate a phrase starting with the theme
continuation = mc.generate(length=15, seed=[theme])
prompt = f"Write about: {theme.title()} — {continuation}..."
prompts.append(prompt)
return prompts
# Generate story continuations
print("=== Story Continuations ===")
opening = "The old lighthouse stood alone on the cliff"
continuations = generate_continuations(opening, num_variations=3, length=30)
for i, cont in enumerate(continuations, 1):
print(f"\nVersion {i}:")
print(f" {cont[:150]}...")
# Generate writing prompts
print("\n=== Writing Prompts ===")
themes = ['mystery', 'journey', 'memory']
prompts = generate_writing_prompts(themes, num_prompts=1)
for prompt in prompts:
print(f"• {prompt}")
Tips & Variations
- Higher depth (3-5) produces more coherent but less surprising text
- Mix outputs from different depth models for variety (sketch below)
- Use multiple languages for multilingual creative writing exercises
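A sketch of mixing depths, reusing the markov constructor and generate call from this recipe; the assumption that a shorter seed works with either depth follows the usage shown above.
from wikilangs import markov
def mixed_depth_continuations(opening, lang='en', num_variations=4, length=30):
    """Alternate between a looser (depth 2) and a more coherent (depth 3) chain."""
    chains = {d: markov(date='latest', lang=lang, depth=d) for d in (2, 3)}
    words = opening.strip().split()
    variations = []
    for i in range(num_variations):
        d = 2 + (i % 2)                     # round-robin over the two depths
        seed = words[-d:] if len(words) >= d else words
        generated = chains[d].generate(length=length, seed=seed)
        variations.append(opening.rstrip() + ' ' + generated)
    return variations
for v in mixed_depth_continuations("The old lighthouse stood alone on the cliff"):
    print(v[:120])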
The problem: Detect when text switches between languages mid-sentence.
This is common in multilingual communities—Spanglish, Hinglish, Arabic-French mixing, etc.
The insight: By scoring sliding windows of text against multiple language
models, we can identify where the language changes. Each window gets assigned to the
language whose model scores it highest.
When to use: Social media analysis, transcription services, studying
bilingual speech patterns, preprocessing for language-specific tools.
from wikilangs import ngram
def detect_code_switching(text, lang1='en', lang2='es', window_size=3):
"""
Detect language switching within text.
Returns list of (segment, language) tuples showing where
the text switches between languages.
"""
# Load models for both languages
ng1 = ngram(date='latest', lang=lang1, gram_size=2)
ng2 = ngram(date='latest', lang=lang2, gram_size=2)
words = text.split()
if len(words) < window_size:
# Text too short, score as single segment
score1 = ng1.score(text)
score2 = ng2.score(text)
lang = lang1 if score1 > score2 else lang2
return [(text, lang)]
# Score each window
segments = []
i = 0
while i < len(words):
window = ' '.join(words[i:i + window_size])
score1 = ng1.score(window)
score2 = ng2.score(window)
detected = lang1 if score1 > score2 else lang2
segments.append((window, detected))
i += window_size
# Merge consecutive same-language segments
merged = []
for segment, lang in segments:
if merged and merged[-1][1] == lang:
merged[-1] = (merged[-1][0] + ' ' + segment, lang)
else:
merged.append((segment, lang))
return merged
# Example: Spanglish text
examples = [
"I went to the store porque necesitaba comprar some milk for breakfast",
"My abuela always says que la familia es lo más importante",
"Let's meet mañana at the coffee shop cerca de tu casa",
]
print("=== Code-Switching Detection ===\n")
for text in examples:
print(f"Input: {text}")
segments = detect_code_switching(text, 'en', 'es')
for segment, lang in segments:
print(f" [{lang.upper()}] {segment}")
print()
Tips & Variations
- Smaller windows detect finer-grained switches but are noisier; an overlapping-window variant is sketched below
- Use character n-grams for very short switches (single words)
- This technique works for any language pair with trained models
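A finer-grained variant of this recipe's detector, sketched with overlapping windows (stride 1) and per-word voting; it relies only on the same score calls as above.
from wikilangs import ngram
def detect_code_switching_fine(text, lang1='en', lang2='es', window_size=3):
    """Overlapping windows vote on each word's language for finer boundaries."""
    ng1 = ngram(date='latest', lang=lang1, gram_size=2)
    ng2 = ngram(date='latest', lang=lang2, gram_size=2)
    words = text.split()
    votes = [0] * len(words)                # positive -> lang1, negative -> lang2
    for start in range(max(len(words) - window_size + 1, 1)):
        window = ' '.join(words[start:start + window_size])
        delta = 1 if ng1.score(window) > ng2.score(window) else -1
        for i in range(start, min(start + window_size, len(words))):
            votes[i] += delta
    labels = [lang1 if v >= 0 else lang2 for v in votes]
    # Merge consecutive words with the same label into segments
    segments = []
    for word, lab in zip(words, labels):
        if segments and segments[-1][1] == lab:
            segments[-1] = (segments[-1][0] + ' ' + word, lab)
        else:
            segments.append((word, lab))
    return segments
print(detect_code_switching_fine("Let's meet mañana at the coffee shop cerca de tu casa"))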
The problem: Assess how difficult a text is to read. This helps writers
target the right audience and ensures accessibility.
The insight: Word frequency is a strong proxy for familiarity. Common
words (rank 1-2000) are learned early; rare words require specialized knowledge.
By analyzing the frequency distribution of words, we estimate reading difficulty.
When to use: Content writing tools, educational material assessment,
accessibility checking, plain language compliance.
from wikilangs import vocabulary, tokenizer
import numpy as np
def analyze_readability(text, lang='en'):
"""
Analyze text readability based on vocabulary frequency.
Returns detailed metrics about word complexity and
an overall readability assessment.
"""
vocab = vocabulary(date='latest', lang=lang)
tok = tokenizer(date='latest', lang=lang, vocab_size=32000)
# Tokenize and filter to words
tokens = tok.tokenize(text)
words = [t.strip('_') for t in tokens
if t.strip('_').isalpha() and len(t.strip('_')) > 1]
if not words:
return None
# Look up ranks for each word
ranks = []
rare_words = []
unknown_words = []
for word in words:
info = vocab.lookup(word.lower())
if info and info.get('rank'):
rank = info['rank']
ranks.append(rank)
if rank > 10000:
rare_words.append(word)
else:
unknown_words.append(word)
if not ranks:
return {'error': 'Could not analyze - no known words'}
avg_rank = np.mean(ranks)
median_rank = np.median(ranks)
# Determine reading level
if median_rank < 1500:
level = 'elementary'
description = 'Suitable for general audiences, easy to read'
elif median_rank < 3500:
level = 'intermediate'
description = 'Standard difficulty, comfortable for educated readers'
elif median_rank < 7000:
level = 'advanced'
description = 'Requires good vocabulary, some specialized terms'
else:
level = 'specialized'
description = 'Technical or academic, requires domain knowledge'
return {
'word_count': len(words),
'unique_words': len(set(w.lower() for w in words)),
'vocabulary_diversity': len(set(w.lower() for w in words)) / len(words),
'average_word_rank': round(avg_rank),
'median_word_rank': round(median_rank),
'rare_word_percentage': round(len(rare_words) / len(words) * 100, 1),
'sample_rare_words': list(set(rare_words))[:5],
'reading_level': level,
'description': description
}
# Compare different texts
texts = {
'children': "The cat sat on the mat. It was a sunny day. The children played in the park.",
'news': "The committee announced new policy measures aimed at reducing inflation while maintaining employment levels.",
'academic': "The epistemological implications of quantum entanglement challenge our fundamental ontological assumptions about locality and causality."
}
print("=== Readability Analysis ===\n")
for label, text in texts.items():
result = analyze_readability(text)
print(f"{label.upper()}:")
print(f" Level: {result['reading_level']} ({result['description']})")
print(f" Median word rank: {result['median_word_rank']}")
print(f" Rare words: {result['rare_word_percentage']}%")
if result['sample_rare_words']:
print(f" Examples: {', '.join(result['sample_rare_words'])}")
print()
Tips & Variations
- Combine with sentence length metrics for a fuller readability picture (see the sketch below)
- Compare against target audience benchmarks (e.g., grade level)
- Suggest simpler alternatives for rare words
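A rough sketch of combining the recipe's output with sentence length; the regex split and the combined difficulty formula are illustrative heuristics, not calibrated metrics.
import re
def sentence_length_stats(text):
    """Average words per sentence, a classic readability ingredient."""
    # Rough split on sentence-final punctuation followed by whitespace
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]
    if not sentences:
        return {'sentences': 0, 'avg_words_per_sentence': 0.0}
    lengths = [len(s.split()) for s in sentences]
    return {'sentences': len(sentences),
            'avg_words_per_sentence': sum(lengths) / len(lengths)}
# Illustrative combination: rare vocabulary and long sentences both push the
# text toward the harder end of the scale (the weights here are arbitrary)
metrics = analyze_readability(texts['academic'])
length_stats = sentence_length_stats(texts['academic'])
difficulty = metrics['median_word_rank'] / 1000 + length_stats['avg_words_per_sentence'] / 10
print(f"Combined difficulty score (illustrative): {difficulty:.1f}")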
The problem: Find words in another language that might be related to
a word in your source language—translations, cognates, or conceptually similar terms.
The insight: If embeddings are trained on parallel or comparable corpora
(like Wikipedia), similar concepts end up in similar vector spaces. This lets us bridge
languages without explicit translation dictionaries.
When to use: Building bilingual dictionaries, finding translation candidates,
language learning tools, cross-lingual information retrieval.
from wikilangs import embeddings, vocabulary
import numpy as np
def find_similar_words_cross_lingual(word, source_lang, target_lang, top_k=10):
"""
Find words in target language that are semantically similar
to a word in the source language.
Note: Works best for concepts that exist in both languages'
Wikipedia (shared cultural concepts, technical terms, etc.)
"""
# Load embeddings for both languages
source_emb = embeddings(date='latest', lang=source_lang, dimension=64)
target_emb = embeddings(date='latest', lang=target_lang, dimension=64)
target_vocab = vocabulary(date='latest', lang=target_lang)
# Get source word vector
try:
source_vec = source_emb.embed_word(word)
except KeyError:
return {'error': f"Word '{word}' not found in {source_lang} embeddings"}
# Compare with frequent target words
# (Using top words for efficiency; expand for better coverage)
target_words = target_vocab.get_words_with_prefix('', top_k=5000)
similarities = []
for tw in target_words:
try:
target_vec = target_emb.embed_word(tw)
# Cosine similarity
sim = np.dot(source_vec, target_vec) / (
np.linalg.norm(source_vec) * np.linalg.norm(target_vec)
)
similarities.append((tw, float(sim)))
        except Exception:
            # Word missing from the target embeddings, or another per-word failure
            continue
# Sort by similarity
similarities.sort(key=lambda x: x[1], reverse=True)
return similarities[:top_k]
# Example: Find French words similar to English concepts
print("=== Cross-Lingual Word Discovery ===\n")
test_words = ['computer', 'science', 'music', 'democracy']
for word in test_words:
print(f"English '{word}' -> French similar words:")
results = find_similar_words_cross_lingual(word, 'en', 'fr', top_k=5)
if isinstance(results, dict) and 'error' in results:
print(f" {results['error']}")
else:
for french_word, sim in results:
print(f" {french_word}: {sim:.3f}")
print()
Tips & Variations
- Results improve with larger embedding dimensions (128d)
- Filter target words by part-of-speech for cleaner results
- Use this to bootstrap translation memory for low-resource language pairs (a filtering sketch follows below)
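A sketch of the last tip, filtering the recipe's top-1 matches by a similarity cutoff; the 0.5 threshold is illustrative and should be tuned per language pair.
def bootstrap_bilingual_lexicon(seed_words, source_lang='en', target_lang='fr',
                                min_similarity=0.5):
    """Keep only confident top-1 matches as translation-memory candidates."""
    lexicon = {}
    for word in seed_words:
        results = find_similar_words_cross_lingual(word, source_lang, target_lang, top_k=1)
        if isinstance(results, dict):        # error case from the recipe above
            continue
        if results and results[0][1] >= min_similarity:
            lexicon[word] = results[0][0]
    return lexicon
print(bootstrap_bilingual_lexicon(['computer', 'music', 'science']))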
The problem: Generate variations of training data to improve model
robustness. More diverse training examples help models generalize better.
The insight: Markov chains and vocabulary can generate plausible
word insertions and substitutions that preserve meaning while adding variety.
Unlike random perturbations, these changes respect language structure.
When to use: Expanding small training datasets, improving model
robustness, testing system behavior with varied inputs.
from wikilangs import markov, vocabulary
import random
def augment_text(text, lang='en', num_augments=5):
"""
Generate augmented versions of text for training data expansion.
Uses multiple strategies:
1. Contextual word insertion (Markov chains)
2. Synonym-like substitution (prefix matching)
"""
mc = markov(date='latest', lang=lang, depth=2)
vocab = vocabulary(date='latest', lang=lang)
words = text.split()
augmented = []
for _ in range(num_augments):
new_words = words.copy()
# Strategy 1: Insert a contextual word
if len(words) > 3 and random.random() < 0.5:
insert_pos = random.randint(1, len(new_words) - 1)
# Get context from preceding words
context = tuple(new_words[max(0, insert_pos-2):insert_pos])
transitions = mc.get_transitions(context)
if transitions:
# Sample from likely next words
candidates = list(transitions.keys())[:10]
if candidates:
new_words.insert(insert_pos, random.choice(candidates))
# Strategy 2: Replace a word with similar one
if len(new_words) > 2 and random.random() < 0.5:
# Pick a random content word (skip short words)
content_positions = [i for i, w in enumerate(new_words)
if len(w) > 4 and w.isalpha()]
if content_positions:
replace_pos = random.choice(content_positions)
word = new_words[replace_pos]
# Find words with same prefix
similar = vocab.get_words_with_prefix(word[:3], top_k=10)
similar = [w for w in similar if w.lower() != word.lower()]
if similar:
new_words[replace_pos] = random.choice(similar)
augmented.append(' '.join(new_words))
    # Remove duplicates while preserving order, and drop exact copies of the original
    seen = set()
    unique = []
    for variant in augmented:
        if variant not in seen and variant != ' '.join(words):
            seen.add(variant)
            unique.append(variant)
return unique
# Example: Augment training examples
print("=== Data Augmentation ===\n")
examples = [
"The neural network achieved high accuracy on the test set.",
"Customer service representatives handle incoming calls efficiently.",
]
for original in examples:
print(f"Original: {original}")
variations = augment_text(original, num_augments=5)
print("Augmented:")
for i, var in enumerate(variations[:3], 1):
print(f" {i}. {var}")
print()
Tips & Variations
- Validate that augmented samples maintain label correctness (a fluency pre-filter is sketched below)
- Combine with back-translation for more diverse augmentation
- Adjust augmentation aggressiveness based on original dataset size
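A sketch of a fluency pre-filter using the trigram score method from earlier recipes; it drops variants the n-gram model finds much less plausible than the original, but it does not verify labels, which still needs a classifier or human review.
from wikilangs import ngram
def filter_augmented(original, variants, lang='en', max_drop=10.0):
    """Keep variants whose per-word n-gram score stays close to the original's."""
    ng = ngram(date='latest', lang=lang, gram_size=3)
    base = ng.score(original) / max(len(original.split()), 1)
    kept = []
    for variant in variants:
        score = ng.score(variant) / max(len(variant.split()), 1)
        if base - score <= max_drop:         # max_drop is an illustrative tolerance
            kept.append(variant)
    return kept
original = "The neural network achieved high accuracy on the test set."
print(filter_augmented(original, augment_text(original)))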
The problem: Identify text that doesn't fit expected patterns—gibberish,
spam, corrupted data, or out-of-domain content.
The insight: Language models assign low probability (high perplexity)
to unusual text. Random characters, spam patterns, and text in the wrong language
will score significantly worse than normal text.
When to use: Data cleaning pipelines, spam detection, quality assurance
for user-generated content, detecting corrupted records.
from wikilangs import ngram
import numpy as np
def detect_anomalies(texts, lang='en', percentile_threshold=90):
"""
Identify anomalous texts that don't fit normal language patterns.
Uses perplexity (how "surprised" the model is) to detect:
- Random/gibberish text
- Spam patterns
- Wrong language
- Corrupted data
    Returns a (anomalies, threshold) tuple: the texts scoring above the given
    percentile, plus the threshold value itself.
"""
ng = ngram(date='latest', lang=lang, gram_size=3)
# Score all texts
scored = []
for text in texts:
if not text.strip():
continue
log_prob = ng.score(text)
# Normalize by length (per-word perplexity proxy)
word_count = max(len(text.split()), 1)
perplexity = -log_prob / word_count
scored.append({
'text': text,
'perplexity': perplexity,
'log_prob': log_prob
})
if not scored:
return [], 0
# Find threshold
perplexities = [s['perplexity'] for s in scored]
threshold = np.percentile(perplexities, percentile_threshold)
# Identify anomalies (high perplexity = unusual)
anomalies = []
for item in scored:
if item['perplexity'] > threshold:
preview = item['text'][:80] + '...' if len(item['text']) > 80 else item['text']
anomalies.append({
'text': preview,
'perplexity': round(item['perplexity'], 2),
'reason': classify_anomaly(item['perplexity'], threshold)
})
return anomalies, threshold
def classify_anomaly(perplexity, threshold):
"""Provide a human-readable reason for the anomaly."""
ratio = perplexity / threshold
if ratio > 3:
return "Severely unusual - likely gibberish or wrong language"
elif ratio > 2:
return "Very unusual - possibly spam or corrupted"
else:
return "Mildly unusual - may need review"
# Example: Find anomalies in a mixed dataset
print("=== Anomaly Detection ===\n")
texts = [
# Normal texts
"The weather forecast predicts sunny skies for tomorrow.",
"Machine learning algorithms require substantial training data.",
"The committee will review the proposal next week.",
"Scientists discovered a new species in the rainforest.",
# Anomalies
"asdfghjkl qwerty zxcvbnm random keyboard mashing here",
"$$$ CLICK NOW !!! FREE MONEY !!! BUY BUY BUY !!!",
"Ceci est un texte en français dans un corpus anglais", # French
"aaa bbb ccc ddd eee fff ggg hhh iii jjj kkk lll mmm",
]
anomalies, threshold = detect_anomalies(texts, percentile_threshold=75)
print(f"Threshold perplexity: {threshold:.2f}\n")
print(f"Found {len(anomalies)} anomalies:\n")
for a in anomalies:
print(f"• {a['text']}")
print(f" Perplexity: {a['perplexity']} | {a['reason']}")
print()
Tips & Variations
- Adjust percentile threshold based on expected anomaly rate
- Use multiple language models to detect wrong-language content (sketch below)
- Combine with rule-based filters for known spam patterns
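A sketch of the second tip, reusing ngram.score: if another language's model explains the text better than the expected language's model, flag it as wrong-language rather than gibberish.
from wikilangs import ngram
def is_wrong_language(text, expected_lang='en', other_langs=('fr', 'es', 'de')):
    """Flag text that another language's model explains better than the expected one."""
    expected_score = ngram(date='latest', lang=expected_lang, gram_size=3).score(text)
    for lang in other_langs:
        try:
            if ngram(date='latest', lang=lang, gram_size=3).score(text) > expected_score:
                return True, lang
        except Exception:
            continue
    return False, expected_lang
print(is_wrong_language("Ceci est un texte en français dans un corpus anglais"))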