logicaffeine_language/
lexicon.rs

1//! Lexicon: Vocabulary lookup functions
2//!
3//! This module includes the compile-time generated lexicon lookup code
4//! from build.rs. It provides ~56 lookup functions for classifying words.
5
6// Include the generated lexicon lookup functions
7include!(concat!(env!("OUT_DIR"), "/lexicon_data.rs"));
8
9// Re-export types from lexicon crate that aren't defined in generated code
10// Note: Polarity, CanonicalMapping are defined in lexicon_data.rs
11pub use logicaffeine_lexicon::{
12    Aspect, Case, Definiteness, Feature, Gender, Number, Sort, Time, VerbClass,
13    AdjectiveMetadata, MorphologicalRule, NounMetadata, VerbEntry, VerbMetadata,
14};
15
16/// Get canonical verb form and whether it's lexically negative.
17/// Used at parse time to transform "lacks" → ("Have", true).
18/// Returns (canonical_lemma, is_negative).
19pub fn get_canonical_verb(lemma: &str) -> Option<(&'static str, bool)> {
20    lookup_canonical(lemma).map(|m| (m.lemma, m.polarity == Polarity::Negative))
21}
22
23/// Lexicon trait for abstracting over static and dynamic lexicons
24pub trait LexiconTrait {
25    fn lookup_verb(&self, word: &str) -> Option<VerbMetadata>;
26    fn lookup_noun(&self, word: &str) -> Option<NounMetadata>;
27    fn lookup_adjective(&self, word: &str) -> Option<AdjectiveMetadata>;
28}
29
30/// Static lexicon implementation using compile-time generated data
31pub struct StaticLexicon;
32
33impl LexiconTrait for StaticLexicon {
34    fn lookup_verb(&self, word: &str) -> Option<VerbMetadata> {
35        lookup_verb_db(word)
36    }
37
38    fn lookup_noun(&self, word: &str) -> Option<NounMetadata> {
39        lookup_noun_db(word)
40    }
41
42    fn lookup_adjective(&self, word: &str) -> Option<AdjectiveMetadata> {
43        lookup_adjective_db(word)
44    }
45}
46
47/// Lexicon struct for verb lookup with inflection handling
48pub struct Lexicon {}
49
50impl Lexicon {
51    pub fn new() -> Self {
52        Lexicon {}
53    }
54
55    pub fn lookup_verb(&self, word: &str) -> Option<VerbEntry> {
56        let lower = word.to_lowercase();
57
58        if let Some(entry) = lookup_irregular_verb(&lower) {
59            return Some(entry);
60        }
61
62        if lower.ends_with("ing") {
63            let stem = self.strip_ing(&lower);
64            let lemma = Self::capitalize(&stem);
65            let class = self.lookup_verb_class(&lemma.to_lowercase());
66            return Some(VerbEntry {
67                lemma,
68                time: Time::None,
69                aspect: Aspect::Progressive,
70                class,
71            });
72        }
73
74        if lower.ends_with("ed") {
75            let stem = self.strip_ed(&lower);
76            // Only treat as verb if the stem is a known base verb
77            // This prevents "doomed" → "Doom" when "doom" isn't in lexicon
78            if !is_base_verb(&stem) {
79                return None;
80            }
81            let lemma = Self::capitalize(&stem);
82            let class = self.lookup_verb_class(&lemma.to_lowercase());
83            return Some(VerbEntry {
84                lemma,
85                time: Time::Past,
86                aspect: Aspect::Simple,
87                class,
88            });
89        }
90
91        let is_third_person = if lower.ends_with("es") && lower.len() > 2 {
92            true
93        } else if lower.ends_with("s") && !lower.ends_with("ss") && lower.len() > 2 {
94            true
95        } else {
96            false
97        };
98
99        if is_third_person {
100            if is_stemming_exception(&lower) {
101                return None;
102            }
103
104            let stem = self.strip_s(&lower);
105            if !is_base_verb(&stem) {
106                return None;
107            }
108            let lemma = Self::capitalize(&stem);
109            let class = self.lookup_verb_class(&lemma.to_lowercase());
110            return Some(VerbEntry {
111                lemma,
112                time: Time::Present,
113                aspect: Aspect::Simple,
114                class,
115            });
116        }
117
118        // Check if this is a base verb form
119        if is_base_verb(&lower) {
120            let lemma = Self::capitalize(&lower);
121            let class = self.lookup_verb_class(&lower);
122            return Some(VerbEntry {
123                lemma,
124                time: Time::Present,
125                aspect: Aspect::Simple,
126                class,
127            });
128        }
129
130        None
131    }
132
133    fn lookup_verb_class(&self, lemma: &str) -> VerbClass {
134        lookup_verb_class(lemma)
135    }
136
137    fn strip_ing(&self, word: &str) -> String {
138        let base = &word[..word.len() - 3];
139
140        if base.len() >= 2 {
141            let chars: Vec<char> = base.chars().collect();
142            let last = chars[chars.len() - 1];
143            let second_last = chars[chars.len() - 2];
144
145            if last == second_last && !"aeiou".contains(last) {
146                return base[..base.len() - 1].to_string();
147            }
148        }
149
150        if needs_e_ing(base) {
151            return format!("{}e", base);
152        }
153
154        base.to_string()
155    }
156
157    fn strip_ed(&self, word: &str) -> String {
158        let base = &word[..word.len() - 2];
159
160        if base.ends_with("i") {
161            return format!("{}y", &base[..base.len() - 1]);
162        }
163
164        if base.len() >= 2 {
165            let chars: Vec<char> = base.chars().collect();
166            let last = chars[chars.len() - 1];
167            let second_last = chars[chars.len() - 2];
168
169            // Doubled consonant handling for verbs like "stopped" → "stop"
170            // BUT: first check if the base WITH doubled consonant is already a verb
171            // This handles words like "passed" → "pass" (natural double 's')
172            if last == second_last && !"aeiou".contains(last) {
173                // First try the base as-is (handles "pass", "miss", "kiss", etc.)
174                if is_base_verb(base) {
175                    return base.to_string();
176                }
177                // Otherwise strip the doubled consonant (handles "stopped" → "stop")
178                return base[..base.len() - 1].to_string();
179            }
180
181            // Consonant clusters that typically come from silent-e verbs:
182            // "tabled" → "tabl" needs "e", "googled" → "googl" needs "e"
183            // Pattern: consonant + l/r at end, with vowel before the consonant
184            if (last == 'l' || last == 'r') && !"aeiou".contains(second_last) {
185                if chars.len() >= 3 && "aeiou".contains(chars[chars.len() - 3]) {
186                    return format!("{}e", base);
187                }
188            }
189        }
190
191        if needs_e_ed(base) {
192            return format!("{}e", base);
193        }
194
195        // Fallback: try adding 'e' and check if that's a valid verb
196        // This handles all silent-e verbs not explicitly in needs_e_ed
197        // e.g., "escaped" → "escap" → "escape" (valid verb)
198        let with_e = format!("{}e", base);
199        if is_base_verb(&with_e) {
200            return with_e;
201        }
202
203        base.to_string()
204    }
205
206    fn strip_s(&self, word: &str) -> String {
207        if word.ends_with("ies") {
208            return format!("{}y", &word[..word.len() - 3]);
209        }
210        // For verbs ending in silent 'e': hopes → hope, decides → decide
211        // These add "s" not "es", so stripping just "s" gives correct lemma
212        if word.ends_with("es") {
213            let base_minus_es = &word[..word.len() - 2];
214            let base_minus_s = &word[..word.len() - 1];
215            // If base-1 ends in 'e', probably a silent-e verb: hopes → hope
216            if base_minus_s.ends_with('e') {
217                return base_minus_s.to_string();
218            }
219            // Otherwise it's a sibilant ending: watches → watch, fixes → fix
220            return base_minus_es.to_string();
221        }
222        word[..word.len() - 1].to_string()
223    }
224
225    fn capitalize(s: &str) -> String {
226        let mut chars = s.chars();
227        match chars.next() {
228            None => String::new(),
229            Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
230        }
231    }
232}
233
234impl Default for Lexicon {
235    fn default() -> Self {
236        Self::new()
237    }
238}
239
240/// Result of smart word analysis for derivational morphology
241#[derive(Debug, Clone, PartialEq, Eq)]
242pub enum WordAnalysis {
243    /// A dictionary entry (exact match or derived plural)
244    Noun(NounMetadata),
245    /// A word derived via morphological rules (agentive nouns like "blogger")
246    DerivedNoun {
247        lemma: String,
248        number: Number,
249    },
250}
251
252/// Smart word analysis with derivational morphology support.
253///
254/// Three-step resolution:
255/// 1. **Exact Match** - Check if word exists in lexicon (handles irregulars like "mice")
256/// 2. **Plural Derivation** - Strip 's'/'es' and check if stem exists (farmers → farmer)
257/// 3. **Morphological Rules** - Apply suffix rules for unknown agentive nouns
258pub fn analyze_word(word: &str) -> Option<WordAnalysis> {
259    let lower = word.to_lowercase();
260
261    // 1. EXACT MATCH (Fast Path)
262    // Handles explicit entries like "farmer", "mice", "children"
263    if let Some(meta) = lookup_noun_db(&lower) {
264        return Some(WordAnalysis::Noun(meta));
265    }
266
267    // 2. PLURAL DERIVATION (Smart Path)
268    // "farmers" → stem "farmer" → lookup
269    if lower.ends_with('s') && lower.len() > 2 {
270        // Try simple 's' stripping: "farmers" -> "farmer"
271        let stem = &lower[..lower.len() - 1];
272        if let Some(meta) = lookup_noun_db(stem) {
273            // Found the singular base - return as plural
274            return Some(WordAnalysis::Noun(NounMetadata {
275                lemma: meta.lemma,
276                number: Number::Plural,
277                features: meta.features,
278            }));
279        }
280
281        // Try 'es' stripping: "boxes" -> "box", "churches" -> "church"
282        if lower.ends_with("es") && lower.len() > 3 {
283            let stem_es = &lower[..lower.len() - 2];
284            if let Some(meta) = lookup_noun_db(stem_es) {
285                return Some(WordAnalysis::Noun(NounMetadata {
286                    lemma: meta.lemma,
287                    number: Number::Plural,
288                    features: meta.features,
289                }));
290            }
291        }
292
293        // Try 'ies' -> 'y': "cities" -> "city"
294        if lower.ends_with("ies") && lower.len() > 4 {
295            let stem_ies = format!("{}y", &lower[..lower.len() - 3]);
296            if let Some(meta) = lookup_noun_db(&stem_ies) {
297                return Some(WordAnalysis::Noun(NounMetadata {
298                    lemma: meta.lemma,
299                    number: Number::Plural,
300                    features: meta.features,
301                }));
302            }
303        }
304    }
305
306    // 3. MORPHOLOGICAL RULES (Data-driven from lexicon.json)
307    // Handle agentive nouns like "blogger", "vlogger" even if not in lexicon
308    for rule in get_morphological_rules() {
309        // Check plural form first (e.g., "vloggers" -> "vlogger" -> rule match)
310        let (is_plural, check_word) = if lower.ends_with('s') && !rule.suffix.ends_with('s') {
311            (true, &lower[..lower.len() - 1])
312        } else {
313            (false, lower.as_str())
314        };
315
316        if check_word.ends_with(rule.suffix) {
317            return Some(WordAnalysis::DerivedNoun {
318                lemma: check_word.to_string(),
319                number: if is_plural { Number::Plural } else { Number::Singular },
320            });
321        }
322    }
323
324    None
325}
326
327/// Check if a word is a known common noun or derivable from one.
328/// This is used for sentence-initial capitalization disambiguation.
329pub fn is_derivable_noun(word: &str) -> bool {
330    analyze_word(word).is_some()
331}
332
333/// Check if a word is a proper name (has Feature::Proper in the lexicon).
334/// Used to distinguish "Socrates fears death" from "Birds fly" (bare plurals).
335/// Names like "Socrates", "James", "Chris" end in 's' but aren't plural nouns.
336pub fn is_proper_name(word: &str) -> bool {
337    let lower = word.to_lowercase();
338    if let Some(meta) = lookup_noun_db(&lower) {
339        return meta.features.contains(&Feature::Proper);
340    }
341    false
342}
343
344/// Get the canonical lemma for a noun.
345///
346/// Maps inflected forms to their dictionary headword:
347/// - "men" → "Man"
348/// - "children" → "Child"
349/// - "farmers" → "Farmer"
350///
351/// This is used for predicate canonicalization in the proof engine,
352/// ensuring "All men are mortal" and "Socrates is a man" produce
353/// matching predicates.
354pub fn get_canonical_noun(word: &str) -> Option<&'static str> {
355    match analyze_word(word) {
356        Some(WordAnalysis::Noun(meta)) => Some(meta.lemma),
357        Some(WordAnalysis::DerivedNoun { .. }) => {
358            // Derived nouns (e.g., "blogger") return owned Strings,
359            // so we can't return a static reference.
360            // Fall back to raw word handling in the caller.
361            None
362        }
363        _ => None,
364    }
365}