Je crois que l'entraîneur (trainer) et le classificateur acceptent tous deux des générateurs de caractéristiques personnalisés dans leurs méthodes, mais ceux-ci doivent être des implémentations de FeatureGenerator, et BigramFeatureGenerator n'en est pas une. J'ai donc écrit une implémentation rapide sous forme de classe interne ci-dessous. Essayez ce code (non testé) quand vous en aurez l'occasion.
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.doccat.FeatureGenerator;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
/**
 * Example that trains an OpenNLP document categorizer with a custom bigram feature
 * generator and then classifies a sample sentence with the resulting model.
 *
 * <p>Usage: {@code java DoccatUsingBigram <training-data-file>}. The file is read
 * line by line as UTF-8 and handed to {@link DocumentSampleStream}.
 */
public class DoccatUsingBigram {

  /**
   * Trains a model from the file named by {@code args[0]} and categorizes a sample text.
   *
   * @param args command-line arguments; {@code args[0]} is the path of the training file
   * @throws IOException if the training file cannot be opened or closed
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 1) {
      System.err.println("Usage: DoccatUsingBigram <training-data-file>");
      return;
    }

    InputStream dataIn = new FileInputStream(args[0]);
    try {
      ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");

      // Train the model with the bigram generator instead of the default features.
      ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
      DoccatModel model =
          DocumentCategorizerME.train("en", sampleStream, 10, 100, new MyBigramFeatureGenerator());

      // The categorizer must be given the same feature generator that was used for
      // training. Otherwise it falls back to its default generator and the features
      // produced at classification time do not match the ones the model learned.
      DocumentCategorizerME classifier =
          new DocumentCategorizerME(model, new MyBigramFeatureGenerator());

      // Pass the raw tokens; the categorizer applies the feature generator itself.
      String[] someData = "whatever you are trying to classify".split(" ");
      double[] categorize = classifier.categorize(someData);
    } catch (IOException e) {
      // Failed to read or parse training data, training failed
      e.printStackTrace();
    } finally {
      // Always release the file handle, whether or not training succeeded.
      dataIn.close();
    }
  }

  /**
   * Feature generator that emits every pair of adjacent tokens (bigram) as one feature.
   */
  public static class MyBigramFeatureGenerator implements FeatureGenerator {

    /** Number of consecutive tokens combined into one feature. */
    private static final int GRAM_SIZE = 2;

    /** Text placed between the tokens of a gram; empty, so tokens are concatenated directly. */
    private static final String SEPARATOR = "";

    /**
     * Builds the bigram features for a tokenized document.
     *
     * @param text the document tokens, in order
     * @return one feature per pair of adjacent tokens; empty if there are fewer than two tokens
     */
    @Override
    public Collection<String> extractFeatures(String[] text) {
      return generate(Arrays.asList(text), GRAM_SIZE, SEPARATOR);
    }

    /**
     * Produces all n-grams of {@code input}, joining the tokens of each gram with
     * {@code separator}.
     *
     * @param input the tokens, in order
     * @param n the number of tokens per gram; must be at least 1
     * @param separator the text inserted between tokens of a gram (may be empty)
     * @return the grams in order of their starting position; empty if {@code input}
     *     has fewer than {@code n} tokens
     * @throws IllegalArgumentException if {@code n} is less than 1
     */
    private List<String> generate(List<String> input, int n, String separator) {
      if (n < 1) {
        throw new IllegalArgumentException("n must be at least 1 but was " + n);
      }
      List<String> outGrams = new ArrayList<String>();
      // A gram starting at 'start' needs tokens start .. start + n - 1 to exist.
      for (int start = 0; start + n <= input.size(); start++) {
        StringBuilder gram = new StringBuilder();
        for (int x = start; x < start + n; x++) {
          if (x > start) {
            gram.append(separator);
          }
          gram.append(input.get(x));
        }
        outGrams.add(gram.toString());
      }
      return outGrams;
    }
  }
}
J'espère que cela vous aidera...