clflushopt · kevinjqliu · Apr 21, 2025 · Apr 21, 2025
diff --git a/tpchgen/src/text.rs b/tpchgen/src/text.rs
@@ -5,10 +5,7 @@
 //!
 //! <https://github.com/trinodb/tpch/blob/master/src/main/java/io/trino/tpch/TextPool.java>
 
-use crate::{
-    distribution::{Distribution, Distributions},
-    random::RowRandomInt,
-};
+use crate::{distribution::Distributions, random::RowRandomInt};
 use std::sync::OnceLock;
 
 /// Pool of random text that follows TPC-H grammar.
@@ -157,201 +154,3 @@ impl TextPool {
         }
     }
 }
-
-#[derive(Debug)]
-pub struct TextPoolGenerator {
-    size: usize,
-
-    grammars: ParsedDistribution,
-    noun_phrases: ParsedDistribution,
-    verb_phrases: ParsedDistribution,
-    prepositions: IndexedDistribution,
-    terminators: IndexedDistribution,
-    adverbs: IndexedDistribution,
-    verbs: IndexedDistribution,
-    auxiliaries: IndexedDistribution,
-    articles: IndexedDistribution,
-    adjectives: IndexedDistribution,
-    nouns: IndexedDistribution,
-}
-
-impl TextPoolGenerator {
-    const MAX_SENTENCE_LENGTH: usize = 256;
-
-    pub fn new(size: usize, distributions: &Distributions) -> Self {
-        TextPoolGenerator {
-            size,
-            grammars: ParsedDistribution::new(distributions.grammar()),
-            noun_phrases: ParsedDistribution::new(distributions.noun_phrase()),
-            verb_phrases: ParsedDistribution::new(distributions.verb_phrase()),
-            prepositions: IndexedDistribution::new(distributions.prepositions()),
-            terminators: IndexedDistribution::new(distributions.terminators()),
-            adverbs: IndexedDistribution::new(distributions.adverbs()),
-            verbs: IndexedDistribution::new(distributions.verbs()),
-            auxiliaries: IndexedDistribution::new(distributions.auxiliaries()),
-            articles: IndexedDistribution::new(distributions.articles()),
-            adjectives: IndexedDistribution::new(distributions.adjectives()),
-            nouns: IndexedDistribution::new(distributions.nouns()),
-        }
-    }
-
-    pub fn generate(&mut self) -> String {
-        let mut output = String::with_capacity(self.size + Self::MAX_SENTENCE_LENGTH);
-        let mut random_int = RowRandomInt::new(933588178, i32::MAX);
-
-        while output.len() < self.size {
-            self.generate_sentence(&mut output, &mut random_int);
-        }
-        output.truncate(self.size);
-        output
-    }
-
-    fn generate_sentence(&self, builder: &mut String, random: &mut RowRandomInt) {
-        let index = self.grammars.get_random_index(random);
-        for token in self.grammars.get_tokens(index) {
-            match token {
-                'V' => self.generate_verb_phrase(builder, random),
-                'N' => self.generate_noun_phrase(builder, random),
-                'P' => {
-                    let preposition = self.prepositions.random_value(random);
-                    builder.push_str(preposition);
-                    builder.push_str(" the ");
-                    self.generate_noun_phrase(builder, random);
-                }
-                'T' => {
-                    // trim trailing space
-                    // terminators should abut previous word
-                    builder.pop();
-                    let terminator = self.terminators.random_value(random);
-                    builder.push_str(terminator);
-                }
-                _ => panic!("Unknown token '{}'", token),
-            }
-
-            if !builder.ends_with(' ') {
-                builder.push(' ');
-            }
-        }
-    }
-
-    fn generate_verb_phrase(&self, builder: &mut String, random: &mut RowRandomInt) {
-        let index = self.verb_phrases.get_random_index(random);
-        for token in self.verb_phrases.get_tokens(index) {
-            match token {
-                'D' => builder.push_str(self.adverbs.random_value(random)),
-                'V' => builder.push_str(self.verbs.random_value(random)),
-                'X' => builder.push_str(self.auxiliaries.random_value(random)),
-                _ => panic!("Unknown token '{}'", token),
-            }
-
-            // string may end with a comma or such
-            builder.push_str(self.verb_phrases.get_bonus_text(index));
-
-            // add a space
-            builder.push(' ');
-        }
-    }
-
-    fn generate_noun_phrase(&self, builder: &mut String, random: &mut RowRandomInt) {
-        let index = self.noun_phrases.get_random_index(random);
-        for token in self.noun_phrases.get_tokens(index) {
-            match token {
-                'A' => builder.push_str(self.articles.random_value(random)),
-                'J' => builder.push_str(self.adjectives.random_value(random)),
-                'D' => builder.push_str(self.adverbs.random_value(random)),
-                'N' => builder.push_str(self.nouns.random_value(random)),
-                _ => panic!("Unknown token '{}'", token),
-            }
-
-            // string may end with a comma or such
-            builder.push_str(self.noun_phrases.get_bonus_text(index));
-
-            // add a space
-            builder.push(' ');
-        }
-    }
-}
-
-#[derive(Debug)]
-struct IndexedDistribution {
-    random_table: Vec<String>,
-}
-
-impl IndexedDistribution {
-    fn new(distribution: &Distribution) -> Self {
-        let max_weight = distribution.get_weight(distribution.size() - 1);
-        let mut random_table = vec![String::new(); max_weight as usize];
-
-        let mut value_index = 0;
-        for (i, item) in random_table.iter_mut().enumerate() {
-            if i >= distribution.get_weight(value_index) as usize {
-                value_index += 1;
-            }
-            *item = distribution.get_value(value_index).to_string();
-        }
-
-        IndexedDistribution { random_table }
-    }
-
-    fn random_value(&self, random: &mut RowRandomInt) -> &str {
-        let random_index = random.next_int(0, self.random_table.len() as i32 - 1) as usize;
-        &self.random_table[random_index]
-    }
-}
-
-#[derive(Debug)]
-struct ParsedDistribution {
-    parsed_distribution: Vec<Vec<char>>,
-    bonus_text: Vec<String>,
-    random_table: Vec<usize>,
-}
-
-impl ParsedDistribution {
-    fn new(distribution: &Distribution) -> Self {
-        let size = distribution.size();
-        let mut parsed_distribution = Vec::with_capacity(size);
-        let mut bonus_text = Vec::with_capacity(size);
-
-        for i in 0..size {
-            let value = distribution.get_value(i);
-            let tokens: Vec<&str> = value.split_whitespace().collect();
-
-            let mut chars = Vec::with_capacity(tokens.len());
-            for token in &tokens {
-                chars.push(token.chars().next().unwrap());
-                bonus_text.push(token[1..].to_string());
-            }
-            parsed_distribution.push(chars);
-        }
-
-        let max_weight = distribution.get_weight(size - 1);
-        let mut random_table = vec![0; max_weight as usize];
-
-        let mut value_index = 0;
-        for (i, item) in random_table.iter_mut().enumerate() {
-            if i >= distribution.get_weight(value_index) as usize {
-                value_index += 1;
-            }
-            *item = value_index;
-        }
-
-        ParsedDistribution {
-            parsed_distribution,
-            bonus_text,
-            random_table,
-        }
-    }
-
-    fn get_random_index(&self, random: &mut RowRandomInt) -> usize {
-        let random_index = random.next_int(0, self.random_table.len() as i32 - 1) as usize;
-        self.random_table[random_index]
-    }
-
-    fn get_tokens(&self, index: usize) -> &[char] {
-        &self.parsed_distribution[index]
-    }
-
-    fn get_bonus_text(&self, index: usize) -> &str {
-        &self.bonus_text[index]
-    }
-}