Skip to content

fix: Drop unused TextPoolGenerator #141

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 21, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 1 addition & 202 deletions tpchgen/src/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@
//!
//! <https://github.com/trinodb/tpch/blob/master/src/main/java/io/trino/tpch/TextPool.java>

use crate::{
distribution::{Distribution, Distributions},
random::RowRandomInt,
};
use crate::{distribution::Distributions, random::RowRandomInt};
use std::sync::OnceLock;

/// Pool of random text that follows TPC-H grammar.
Expand Down Expand Up @@ -157,201 +154,3 @@ impl TextPool {
}
}
}

/// Builds a block of pseudo-random text by expanding grammar patterns into
/// words drawn from the supplied distributions.
///
/// Construct it with [`TextPoolGenerator::new`] and call
/// [`TextPoolGenerator::generate`] to obtain the text.
#[derive(Debug)]
pub struct TextPoolGenerator {
/// Length, in bytes, that the generated text is truncated to.
size: usize,

/// Sentence-level patterns; each token selects a phrase, preposition or terminator.
grammars: ParsedDistribution,
/// Patterns expanded when a sentence asks for a noun phrase.
noun_phrases: ParsedDistribution,
/// Patterns expanded when a sentence asks for a verb phrase.
verb_phrases: ParsedDistribution,
/// Words emitted for the `P` sentence token (followed by `" the "` and a noun phrase).
prepositions: IndexedDistribution,
/// Sentence terminators emitted for the `T` sentence token.
terminators: IndexedDistribution,
/// Words emitted for the `D` phrase token.
adverbs: IndexedDistribution,
/// Words emitted for the `V` verb-phrase token.
verbs: IndexedDistribution,
/// Words emitted for the `X` verb-phrase token.
auxiliaries: IndexedDistribution,
/// Words emitted for the `A` noun-phrase token.
articles: IndexedDistribution,
/// Words emitted for the `J` noun-phrase token.
adjectives: IndexedDistribution,
/// Words emitted for the `N` noun-phrase token.
nouns: IndexedDistribution,
}

impl TextPoolGenerator {
    /// Extra capacity reserved beyond `size`, so the sentence that crosses the
    /// target length usually fits without growing the buffer.
    const MAX_SENTENCE_LENGTH: usize = 256;

    /// Creates a generator that will produce `size` bytes of text using the
    /// grammar and word tables found in `distributions`.
    pub fn new(size: usize, distributions: &Distributions) -> Self {
        // Pattern tables first, then the word tables, in field order.
        let grammars = ParsedDistribution::new(distributions.grammar());
        let noun_phrases = ParsedDistribution::new(distributions.noun_phrase());
        let verb_phrases = ParsedDistribution::new(distributions.verb_phrase());
        let prepositions = IndexedDistribution::new(distributions.prepositions());
        let terminators = IndexedDistribution::new(distributions.terminators());
        let adverbs = IndexedDistribution::new(distributions.adverbs());
        let verbs = IndexedDistribution::new(distributions.verbs());
        let auxiliaries = IndexedDistribution::new(distributions.auxiliaries());
        let articles = IndexedDistribution::new(distributions.articles());
        let adjectives = IndexedDistribution::new(distributions.adjectives());
        let nouns = IndexedDistribution::new(distributions.nouns());

        Self {
            size,
            grammars,
            noun_phrases,
            verb_phrases,
            prepositions,
            terminators,
            adverbs,
            verbs,
            auxiliaries,
            articles,
            adjectives,
            nouns,
        }
    }

    /// Generates the text: sentences are appended until at least `size` bytes
    /// exist, then the result is cut back to exactly `size` bytes.
    ///
    /// The random source is created with a fixed seed on every call.
    pub fn generate(&mut self) -> String {
        let target_len = self.size;
        let mut text = String::with_capacity(target_len + Self::MAX_SENTENCE_LENGTH);
        let mut rng = RowRandomInt::new(933588178, i32::MAX);

        while text.len() < target_len {
            self.generate_sentence(&mut text, &mut rng);
        }

        // The last sentence normally overshoots the target; drop the excess.
        text.truncate(target_len);
        text
    }

    /// Appends one sentence to `builder`, following a randomly chosen grammar.
    ///
    /// # Panics
    ///
    /// Panics if the grammar contains a token other than `V`, `N`, `P` or `T`.
    fn generate_sentence(&self, builder: &mut String, random: &mut RowRandomInt) {
        let grammar = self.grammars.get_random_index(random);

        for &symbol in self.grammars.get_tokens(grammar) {
            match symbol {
                'V' => self.generate_verb_phrase(builder, random),
                'N' => self.generate_noun_phrase(builder, random),
                'P' => {
                    builder.push_str(self.prepositions.random_value(random));
                    builder.push_str(" the ");
                    self.generate_noun_phrase(builder, random);
                }
                'T' => {
                    // Remove the trailing space so the terminator sits
                    // directly against the preceding word.
                    builder.pop();
                    builder.push_str(self.terminators.random_value(random));
                }
                _ => panic!("Unknown token '{}'", symbol),
            }

            // Every element of the sentence is followed by exactly one space.
            let needs_space = !builder.ends_with(' ');
            if needs_space {
                builder.push(' ');
            }
        }
    }

    /// Appends a verb phrase to `builder`, following a randomly chosen pattern.
    ///
    /// # Panics
    ///
    /// Panics if the pattern contains a token other than `D`, `V` or `X`.
    fn generate_verb_phrase(&self, builder: &mut String, random: &mut RowRandomInt) {
        let phrase = self.verb_phrases.get_random_index(random);

        for &symbol in self.verb_phrases.get_tokens(phrase) {
            let word = match symbol {
                'D' => self.adverbs.random_value(random),
                'V' => self.verbs.random_value(random),
                'X' => self.auxiliaries.random_value(random),
                _ => panic!("Unknown token '{}'", symbol),
            };
            builder.push_str(word);

            // Extra text attached to the pattern (may be a comma or similar).
            builder.push_str(self.verb_phrases.get_bonus_text(phrase));

            // Word separator.
            builder.push(' ');
        }
    }

    /// Appends a noun phrase to `builder`, following a randomly chosen pattern.
    ///
    /// # Panics
    ///
    /// Panics if the pattern contains a token other than `A`, `J`, `D` or `N`.
    fn generate_noun_phrase(&self, builder: &mut String, random: &mut RowRandomInt) {
        let phrase = self.noun_phrases.get_random_index(random);

        for &symbol in self.noun_phrases.get_tokens(phrase) {
            let word = match symbol {
                'A' => self.articles.random_value(random),
                'J' => self.adjectives.random_value(random),
                'D' => self.adverbs.random_value(random),
                'N' => self.nouns.random_value(random),
                _ => panic!("Unknown token '{}'", symbol),
            };
            builder.push_str(word);

            // Extra text attached to the pattern (may be a comma or similar).
            builder.push_str(self.noun_phrases.get_bonus_text(phrase));

            // Word separator.
            builder.push(' ');
        }
    }
}

/// A distribution flattened into a lookup table so that picking a value is a
/// single random index into `random_table`.
#[derive(Debug)]
struct IndexedDistribution {
/// One owned copy of a distribution value per table slot; the table length
/// is the weight reported for the distribution's last entry.
random_table: Vec<String>,
}

impl IndexedDistribution {
    /// Expands `distribution` into a table with one slot per unit of weight.
    ///
    /// The weight of the last entry is used as the table length, and each
    /// entry's weight as the exclusive upper slot bound for that entry — i.e.
    /// the weights are treated as running totals.
    fn new(distribution: &Distribution) -> Self {
        let slot_count = distribution.get_weight(distribution.size() - 1) as usize;
        let mut random_table = Vec::with_capacity(slot_count);

        let mut cursor = 0;
        for slot in 0..slot_count {
            // Step to the next value once this slot passes the current bound.
            if slot >= distribution.get_weight(cursor) as usize {
                cursor += 1;
            }
            random_table.push(distribution.get_value(cursor).to_string());
        }

        IndexedDistribution { random_table }
    }

    /// Returns the value stored in a randomly chosen table slot.
    fn random_value(&self, random: &mut RowRandomInt) -> &str {
        let last_slot = self.random_table.len() as i32 - 1;
        let pick = random.next_int(0, last_slot) as usize;
        &self.random_table[pick]
    }
}

/// A distribution whose values are whitespace-separated token patterns,
/// pre-parsed for fast random expansion.
#[derive(Debug)]
struct ParsedDistribution {
/// For each distribution entry, the first character of each of its tokens.
parsed_distribution: Vec<Vec<char>>,
/// Text following the first character of tokens (for example trailing
/// punctuation), as collected by `new` and read by `get_bonus_text`.
bonus_text: Vec<String>,
/// Lookup table mapping a random slot to a distribution entry index.
random_table: Vec<usize>,
}

impl ParsedDistribution {
    /// Parses every value of `distribution` into its token characters and
    /// builds the weighted lookup table used by [`Self::get_random_index`].
    ///
    /// Each value is split on whitespace. The first character of every token
    /// is the token's type; whatever follows it is "bonus text" (for example
    /// a trailing comma).
    ///
    /// `bonus_text` holds exactly one string per distribution entry so that
    /// [`Self::get_bonus_text`] can be addressed with the same entry index as
    /// [`Self::get_tokens`]. When an entry has several tokens the bonus text
    /// of the last token is kept. NOTE(review): this is intended to mirror
    /// the Java `TextPool` implementation referenced at the top of this
    /// module — confirm against that source.
    fn new(distribution: &Distribution) -> Self {
        let size = distribution.size();
        let mut parsed_distribution = Vec::with_capacity(size);
        let mut bonus_text = Vec::with_capacity(size);

        for i in 0..size {
            let value = distribution.get_value(i);

            let mut chars = Vec::new();
            let mut suffix = "";
            for token in value.split_whitespace() {
                // Walk by `char` rather than slicing at byte offset 1, so a
                // multi-byte leading character cannot cause a panic.
                let mut rest = token.chars();
                if let Some(first) = rest.next() {
                    chars.push(first);
                    suffix = rest.as_str();
                }
            }

            parsed_distribution.push(chars);
            // One entry per distribution value keeps `bonus_text` aligned
            // with `parsed_distribution`.
            bonus_text.push(suffix.to_string());
        }

        // The last entry's weight is used as the table length, and each
        // entry's weight as the exclusive upper slot bound for that entry.
        let max_weight = distribution.get_weight(size - 1);
        let mut random_table = vec![0; max_weight as usize];

        let mut value_index = 0;
        for (i, item) in random_table.iter_mut().enumerate() {
            if i >= distribution.get_weight(value_index) as usize {
                value_index += 1;
            }
            *item = value_index;
        }

        ParsedDistribution {
            parsed_distribution,
            bonus_text,
            random_table,
        }
    }

    /// Returns the index of a randomly chosen distribution entry, taken from
    /// a random slot of the lookup table.
    fn get_random_index(&self, random: &mut RowRandomInt) -> usize {
        let random_index = random.next_int(0, self.random_table.len() as i32 - 1) as usize;
        self.random_table[random_index]
    }

    /// Returns the token characters of the entry at `index`.
    ///
    /// # Panics
    ///
    /// Panics if `index` is out of bounds.
    fn get_tokens(&self, index: usize) -> &[char] {
        &self.parsed_distribution[index]
    }

    /// Returns the bonus text stored for the entry at `index`.
    ///
    /// # Panics
    ///
    /// Panics if `index` is out of bounds.
    fn get_bonus_text(&self, index: usize) -> &str {
        &self.bonus_text[index]
    }
}
Loading