Есть проблема с библиотекой rust_bert для Rust: токенизатор не загружается, если в папке с моделью нет файла vocab.txt. Если вместо него передать tokenizer.json, возникает ошибка о том, что словарь некорректен. Можно ли это как-то решить?
use tch::{Device, Kind};
use std::fmt;
use rust_bert::pipelines::sequence_classification::{
SequenceClassificationModel,
SequenceClassificationConfig,
};
use rust_bert::pipelines::common::{ModelType, ModelResource};
use rust_bert::resources::LocalResource;
/// A sequence classification model bundled with the directory it was loaded from.
pub struct Model {
// The rust_bert classification pipeline that `predict` delegates to.
model: SequenceClassificationModel,
// Directory the model files were read from; this is the only field shown in the `Debug` output.
path: std::path::PathBuf,
}
impl Model {
    /// Loads a sequence classification model from the directory `path`.
    ///
    /// The following files are read from `path`:
    /// `model.safetensors` (weights), `config.json` (model configuration) and
    /// `vocab.txt` (tokenizer vocabulary). No merges file is supplied, casing is
    /// preserved (`lower_case: false`) and weights are loaded as `Kind::Float`.
    ///
    /// When `allow_cuda` is `true` the device is chosen by
    /// `Device::cuda_if_available()`; otherwise the model runs on `Device::Cpu`.
    ///
    /// NOTE(review): a Hugging Face `tokenizer.json` cannot be passed as
    /// `vocab_resource` — the tokenizer then fails with "The special value [UNK]
    /// could not be found in the vocabulary". rust_bert appears to offer
    /// `SequenceClassificationModel::new_with_tokenizer` together with
    /// `TokenizerOption::from_hf_tokenizer_file` (behind the `hf-tokenizers`
    /// feature) for that case — confirm against the rust_bert version in use.
    ///
    /// # Panics
    ///
    /// Panics if rust_bert fails to build the model, e.g. when one of the files
    /// above is missing or the vocabulary does not match the tokenizer.
    pub fn new(path: std::path::PathBuf, model_type: ModelType, allow_cuda: bool) -> Self {
        let device = if allow_cuda {
            Device::cuda_if_available()
        } else {
            Device::Cpu
        };

        let config = SequenceClassificationConfig {
            model_type,
            model_resource: ModelResource::Torch(Box::new(LocalResource::from(
                path.join("model.safetensors"),
            ))),
            config_resource: Box::new(LocalResource::from(path.join("config.json"))),
            vocab_resource: Box::new(LocalResource::from(path.join("vocab.txt"))),
            merges_resource: None,
            lower_case: false,
            strip_accents: None,
            add_prefix_space: None,
            device,
            kind: Some(Kind::Float),
        };

        // The signature cannot report an error, so keep panicking, but name the
        // directory so the failure is actionable without a backtrace.
        let model = SequenceClassificationModel::new(config).unwrap_or_else(|err| {
            panic!(
                "failed to load sequence classification model from {}: {:?}",
                path.display(),
                err
            )
        });

        Self { model, path }
    }

    /// Classifies `text` and returns the id of the first label produced by the
    /// model, narrowed to `i8`.
    ///
    /// # Panics
    ///
    /// Panics if the model returns no label for the input, or if the label id
    /// does not fit into `i8`.
    pub fn predict(&self, text: &str) -> i8 {
        // A one-element array is enough here; no heap-allocated Vec is needed.
        let labels = self.model.predict([text]);
        let label = labels
            .first()
            .expect("sequence classification model returned no label for a single input");

        label
            .id
            .try_into()
            .unwrap_or_else(|_| panic!("label id {} does not fit into i8", label.id))
    }
}
impl fmt::Debug for Model {
    /// Writes `Model { path: .., .. }`: only the `path` field is shown and the
    /// remaining fields are elided as non-exhaustive.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut builder = f.debug_struct("Model");
        builder.field("path", &self.path);
        builder.finish_non_exhaustive()
    }
}
vocab.txt:
thread 'main' panicked at src/nlp.rs:37:11:
called `Result::unwrap()` on an `Err` value: TokenizerError("File not found error: models/profiles_antispam_bert/vocab.txt vocabulary file not found :No such file or directory (os error 2)")
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
tokenizer.json:
thread 'main' panicked at src/nlp.rs:37:11:
called `Result::unwrap()` on an `Err` value: TokenizerError("Token not found in vocabulary: The special value [UNK] could not be found in the vocabulary")
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace