Пытаюсь запустить обучение модели с использованием MALLET (через обёртку gensim):
model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
Получаю ошибку:
CalledProcessError: Command '/Users/username/mallet/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /var/folders/4z/5nc32mdj54ldk9f9znvpjw500000gn/T/c1ffd6_corpus.txt --output /var/folders/4z/5nc32mdj54ldk9f9znvpjw500000gn/T/c1ffd6_corpus.mallet' returned non-zero exit status 1.
Запускаю эту команду в консоли и получаю:
Error: Could not find or load main class cc.mallet.classify.tui.Csv2Vectors
Caused by: java.lang.ClassNotFoundException: cc.mallet.classify.tui.Csv2Vectors
Файл cc/mallet/classify/tui/Csv2Vectors.java в этом месте лежит. Но, видимо, нужно как-то по-другому прописать пути в скрипте, чтобы он его нашёл. Я попробовал разные варианты — не получается.
Код скрипта запуска MALLET (bin/mallet) прилагаю:
#!/bin/bash
# Launcher setup for the MALLET command-line tools: locates the install
# directory, builds the Java classpath and extracts the sub-command name.

# The install root is taken to be the parent of the directory that holds this
# script (i.e. the script is expected to be invoked as <root>/bin/mallet).
malletdir=$(dirname "$0")
malletdir=$(dirname "$malletdir")
# Debug trace; sent to stderr so it does not mix with the tool's own output.
printf '%s\n' "$malletdir" >&2

# Classpath roots. A classpath entry has to be the directory that contains the
# top of the package tree (the "cc" directory) with compiled .class files.
# Pointing it at the install root, or at a package directory inside src/
# (Java sources, not compiled classes), leaves java unable to resolve
# cc.mallet.* classes. This is the original value, which uses <root>/class.
cp=$malletdir/class:$malletdir/lib/mallet-deps.jar:$CLASSPATH
if [[ ! -d "$malletdir/class" ]]; then
  printf 'warning: %s/class does not exist; compiled MALLET classes will not be found\n' "$malletdir" >&2
fi
printf '%s\n' "$cp" >&2

# Maximum Java heap size; used for the -Xmx option of the java command.
MEMORY=1g

# The first argument is the sub-command; the remaining ones are forwarded
# to the Java class selected for it.
CMD=$1
shift
# Print the list of supported Mallet sub-commands, one per line, to stdout.
help() {
  printf '%s\n' \
    'Mallet 2.0 commands:' \
    'import-dir load the contents of a directory into mallet instances (one per file)' \
    'import-file load a single file into mallet instances (one per line)' \
    'import-svmlight load SVMLight format data files into Mallet instances' \
    'info get information about Mallet instances' \
    'train-classifier train a classifier from Mallet data files' \
    'classify-dir classify the contents of a directory with a saved classifier' \
    'classify-file classify data from a single file with a saved classifier' \
    'classify-svmlight classify data from a single file in SVMLight format' \
    'train-topics train a topic model from Mallet data files' \
    'infer-topics use a trained topic model to infer topics for new documents' \
    'evaluate-topics estimate the probability of new documents under a trained model' \
    'prune remove features based on frequency or information gain' \
    'split divide data into testing, training, and validation portions' \
    'bulk-load for big input files, efficiently prune vocabulary and import docs' \
    'Include --help with any option for more information'
}
# Map the sub-command in CMD to the fully-qualified Java class that
# implements it, then launch that class with the remaining arguments.
CLASS=
case "$CMD" in
  import-dir) CLASS=cc.mallet.classify.tui.Text2Vectors ;;
  import-file) CLASS=cc.mallet.classify.tui.Csv2Vectors ;;
  import-svmlight) CLASS=cc.mallet.classify.tui.SvmLight2Vectors ;;
  info) CLASS=cc.mallet.classify.tui.Vectors2Info ;;
  train-classifier) CLASS=cc.mallet.classify.tui.Vectors2Classify ;;
  classify-dir) CLASS=cc.mallet.classify.tui.Text2Classify ;;
  classify-file) CLASS=cc.mallet.classify.tui.Csv2Classify ;;
  classify-svmlight) CLASS=cc.mallet.classify.tui.SvmLight2Classify ;;
  train-topics) CLASS=cc.mallet.topics.tui.TopicTrainer ;;
  infer-topics) CLASS=cc.mallet.topics.tui.InferTopics ;;
  evaluate-topics) CLASS=cc.mallet.topics.tui.EvaluateTopics ;;
  # prune and split are both served by the same class.
  prune) CLASS=cc.mallet.classify.tui.Vectors2Vectors ;;
  split) CLASS=cc.mallet.classify.tui.Vectors2Vectors ;;
  bulk-load) CLASS=cc.mallet.util.BulkLoader ;;
  run)
    # "run" takes the class name from the next argument. Without one, java
    # would be started with an empty class name, so fail early instead.
    if [[ $# -eq 0 ]]; then
      echo "Usage: $0 run <class> [args...]" >&2
      exit 1
    fi
    CLASS=$1
    shift
    ;;
  *)
    echo "Unrecognized command: $CMD"
    help
    exit 1
    ;;
esac

# Debug trace of the classpath, class and forwarded arguments; written to
# stderr so it does not mix with the output of the Java tool.
printf '%s %s\n' "$cp" "$CLASS" >&2
printf '%s\n' "$*" >&2

# MEMORY sets the heap limit, cp the classpath; "$@" forwards the remaining
# arguments unchanged. The script's exit status is that of java.
java "-Xmx$MEMORY" -ea -Djava.awt.headless=true -Dfile.encoding=UTF-8 -server -classpath "$cp" "$CLASS" "$@"