logicaffeine_language/
mwe.rs

use std::collections::HashMap;

use crate::token::{Token, TokenType};
use crate::lexicon::{VerbClass, Time, Aspect};
use logicaffeine_base::Interner;

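/// Lexical target that a matched multi-word expression collapses into:
/// the canonical lemma, its part of speech, and an optional verb class.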
#[derive(Debug, Clone)]
pub struct MweTarget {
    pub lemma: &'static str,
    pub pos: &'static str,
    pub class: Option<VerbClass>,
}

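/// Word-level trie keyed by lowercased lemmas; a node with `target` set
/// marks the end of a known multi-word expression.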
#[derive(Default, Debug)]
pub struct MweTrie {
    pub children: HashMap<String, MweTrie>,
    pub target: Option<MweTarget>,
}

impl MweTrie {
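    /// Inserts `pattern` into the trie, one lowercased word per level,
    /// attaching `target` to the node that ends the pattern.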
    pub fn insert(&mut self, pattern: &[&str], target: MweTarget) {
        if pattern.is_empty() {
            self.target = Some(target);
            return;
        }
        self.children
            .entry(pattern[0].to_lowercase())
            .or_default()
            .insert(&pattern[1..], target);
    }
}

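/// Rewrites the token stream by greedily merging the longest multi-word
/// expression found at each position into a single token; unmatched tokens
/// pass through unchanged.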
pub fn apply_mwe_pipeline(
    tokens: Vec<Token>,
    trie: &MweTrie,
    interner: &mut Interner,
) -> Vec<Token> {
    let mut result = Vec::new();
    let mut i = 0;

    while i < tokens.len() {
        if let Some((match_len, target)) = find_longest_match(&tokens[i..], trie, interner) {
            let merged = create_merged_token(&tokens[i], target, interner);
            result.push(merged);
            i += match_len;
        } else {
            result.push(tokens[i].clone());
            i += 1;
        }
    }
    result
}

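/// Resolves a token to the lowercased lemma string used for trie lookup,
/// falling back to the raw lexeme for kinds without a dedicated lemma symbol.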
fn get_lemma(token: &Token, interner: &Interner) -> String {
    match &token.kind {
        TokenType::Verb { lemma, .. } => interner.resolve(*lemma).to_lowercase(),
        TokenType::Noun(sym) => interner.resolve(*sym).to_lowercase(),
        TokenType::Adjective(sym) => interner.resolve(*sym).to_lowercase(),
        TokenType::NonIntersectiveAdjective(sym) => interner.resolve(*sym).to_lowercase(),
        TokenType::Preposition(sym) => interner.resolve(*sym).to_lowercase(),
        TokenType::Particle(sym) => interner.resolve(*sym).to_lowercase(),
        TokenType::Article(_) => interner.resolve(token.lexeme).to_lowercase(),
        _ => interner.resolve(token.lexeme).to_lowercase(),
    }
}

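/// Walks the trie along the lemmas of `tokens`, returning the length and
/// target of the longest matching prefix, if any.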
fn find_longest_match<'a>(
    tokens: &[Token],
    trie: &'a MweTrie,
    interner: &Interner,
) -> Option<(usize, &'a MweTarget)> {
    let mut node = trie;
    let mut best: Option<(usize, &MweTarget)> = None;

    for (i, token) in tokens.iter().enumerate() {
        let lemma = get_lemma(token, interner);
        if let Some(child) = node.children.get(&lemma) {
            node = child;
            if let Some(target) = &node.target {
                best = Some((i + 1, target));
            }
        } else {
            break;
        }
    }
    best
}

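/// Builds the single replacement token for a matched expression, mapping the
/// target's part-of-speech string onto a concrete `TokenType` and carrying
/// tense and aspect over from the head verb where applicable.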
fn create_merged_token(head: &Token, target: &MweTarget, interner: &mut Interner) -> Token {
    let lemma_sym = interner.intern(target.lemma);

    let kind = match target.pos {
        "Noun" => TokenType::Noun(lemma_sym),
        "Verb" => {
            let (time, aspect) = match &head.kind {
                TokenType::Verb { time, aspect, .. } => (*time, *aspect),
                _ => (Time::Present, Aspect::Simple),
            };
            TokenType::Verb {
                lemma: lemma_sym,
                time,
                aspect,
                class: target.class.unwrap_or(VerbClass::Activity),
            }
        }
        "Preposition" => TokenType::Preposition(lemma_sym),
        "Conjunction" => TokenType::And,
        "Quantifier" => TokenType::NoOne,
        _ => TokenType::Noun(lemma_sym),
    };

    Token {
        kind,
        lexeme: lemma_sym,
        span: head.span,
    }
}

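// Pull in the MWE pattern table generated at build time into `OUT_DIR`.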
include!(concat!(env!("OUT_DIR"), "/mwe_data.rs"));
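
#[cfg(test)]
mod tests {
    use super::*;

    // Sketch of how `MweTrie::insert` behaves; the "give up" pattern and its
    // target values are illustrative, not entries from the generated table.
    #[test]
    fn insert_nests_lowercased_pattern_words() {
        let mut trie = MweTrie::default();
        trie.insert(
            &["Give", "up"],
            MweTarget {
                lemma: "give_up",
                pos: "Verb",
                class: Some(VerbClass::Activity),
            },
        );

        // Words are lowercased and nested one level per word; only the final
        // node carries the target.
        let give = trie.children.get("give").expect("first word is a child node");
        assert!(give.target.is_none());
        assert!(give.children["up"].target.is_some());
    }
}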