Using the Mahout Naive Bayes Classifier to automatically classify Twitter messages (part 2: distribute classification with hadoop)
2013/06/24 12 Comments
In this post, we are going to categorize the tweets by distributing the classification on the hadoop cluster. It can make the classification faster if there is a huge number of tweets to classify.
To go through this tutorial you would need to have run the commands in the post Using the Mahout Naive Bayes Classifier to automatically classify Twitter messages.
To distribute the classification on the hadoop nodes, we are going to define a mapreduce job:
- the csv containing the tweets to classify is split into several chunks
- each chunk is sent to the hadoop node that will process it by running the map class
- the map class loads the naive bayes model and some document/word frequency into memory
- for each tweet of the chunk, it computes the best matching category. The result is written in the output file. We are not using a reducer class as we don’t need to do aggregations.
To download the code used in this post, you can fetch it from github:
$ git clone https://github.com/fredang/mahout-naive-bayes-example2.git
To compile the project:
$ mvn clean package assembly:single
This repository contains the mapreduce job MapReduceClassifier.java:
public class MapReduceClassifier { public static class ClassifierMap extends Mapper<LongWritable, Text, Text, IntWritable> { private final static Text outputKey = new Text(); private final static IntWritable outputValue = new IntWritable(); private static Classifier classifier; @Override protected void setup(Context context) throws IOException { initClassifier(context); } private static void initClassifier(Context context) throws IOException { if (classifier == null) { synchronized (ClassifierMap.class) { if (classifier == null) { classifier = new Classifier(context.getConfiguration()); } } } } public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); String[] tokens = line.split("\t", 2); String tweetId = tokens[0]; String tweet = tokens[1]; int bestCategoryId = classifier.classify(tweet); outputValue.set(bestCategoryId); outputKey.set(tweetId); context.write(outputKey, outputValue); } } public static void main(String[] args) throws Exception { if (args.length < 5) { System.out.println("Arguments: [model] [dictionnary] [document frequency] [output directory]"); return; } String modelPath = args[0]; String dictionaryPath = args[1]; String documentFrequencyPath = args[2]; String tweetsPath = args[3]; String outputPath = args[4]; Configuration conf = new Configuration(); conf.setStrings(Classifier.MODEL_PATH_CONF, modelPath); conf.setStrings(Classifier.DICTIONARY_PATH_CONF, dictionaryPath); conf.setStrings(Classifier.DOCUMENT_FREQUENCY_PATH_CONF, documentFrequencyPath); // do not create a new jvm for each task conf.setLong("mapred.job.reuse.jvm.num.tasks", -1); Job job = new Job(conf, "classifier"); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(ClassifierMap.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(tweetsPath)); 
FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.waitForCompletion(true); } }
It is using the classifier class:
public class Classifier { public final static String MODEL_PATH_CONF = "modelPath"; public final static String DICTIONARY_PATH_CONF = "dictionaryPath"; public final static String DOCUMENT_FREQUENCY_PATH_CONF = "documentFrequencyPath"; private static StandardNaiveBayesClassifier classifier; private static Map<String, Integer> dictionary; private static Map<Integer, Long> documentFrequency; private static Analyzer analyzer; public Classifier(Configuration configuration) throws IOException { String modelPath = configuration.getStrings(MODEL_PATH_CONF)[0]; String dictionaryPath = configuration.getStrings(DICTIONARY_PATH_CONF)[0]; String documentFrequencyPath = configuration.getStrings(DOCUMENT_FREQUENCY_PATH_CONF)[0]; dictionary = readDictionnary(configuration, new Path(dictionaryPath)); documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath)); // analyzer used to extract word from tweet analyzer = new DefaultAnalyzer(); NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration); classifier = new StandardNaiveBayesClassifier(model); } public int classify(String text) throws IOException { int documentCount = documentFrequency.get(-1).intValue(); Multiset words = ConcurrentHashMultiset.create(); // extract words from tweet TokenStream ts = analyzer.reusableTokenStream("text", new StringReader(text)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); int wordCount = 0; while (ts.incrementToken()) { if (termAtt.length() > 0) { String word = ts.getAttribute(CharTermAttribute.class).toString(); Integer wordId = dictionary.get(word); // if the word is not in the dictionary, skip it if (wordId != null) { words.add(word); wordCount++; } } } // create vector wordId => weight using tfidf Vector vector = new RandomAccessSparseVector(10000); TFIDF tfidf = new TFIDF(); for (Multiset.Entry entry:words.entrySet()) { String word = entry.getElement(); int count = entry.getCount(); Integer 
wordId = dictionary.get(word); Long freq = documentFrequency.get(wordId); double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount); vector.setQuick(wordId, tfIdfValue); } // With the classifier, we get one score for each label // The label with the highest score is the one the tweet is more likely to // be associated to Vector resultVector = classifier.classifyFull(vector); double bestScore = -Double.MAX_VALUE; int bestCategoryId = -1; for(Element element: resultVector) { int categoryId = element.index(); double score = element.get(); if (score > bestScore) { bestScore = score; bestCategoryId = categoryId; } } return bestCategoryId; } private static Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) { Map<String, Integer> dictionnary = new HashMap<String, Integer>(); for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true, conf)) { dictionnary.put(pair.getFirst().toString(), pair.getSecond().get()); } return dictionnary; } private static Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) { Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>(); for (Pair<IntWritable, LongWritable> pair : new SequenceFileIterable<IntWritable, LongWritable>(documentFrequencyPath, true, conf)) { documentFrequency.put(pair.getFirst().get(), pair.getSecond().get()); } return documentFrequency; } }
In this code, we are trying to minimize the number of times we load the naive bayes model in memory.
When the input file is split into chunks, they are distributed across the nodes of the hadoop cluster. Each node will handle some chunks. For each of those chunks, Hadoop spawns a new JVM (process) instead of reusing an existing one. We can force hadoop to reuse an existing JVM with this:
conf.setLong("mapred.job.reuse.jvm.num.tasks", -1);
In the JVM, we can keep the data model in memory so that subsequent tasks that use the same JVM can use the model without having to reload it. This is done by using a static attribute (see the method MapReduceClassifier.initClassifier).
If you have run the commands in the previous post, you should have the following files in HDFS:
- tweets-vectors/dictionary.file-0
- tweets-vectors/df-count/part-r-00000
We would need to copy the file data/tweets-to-classify.tsv to HDFS so it can be read by the hadoop job:
$ hadoop fs -put data/tweets-to-classify.tsv tweets-to-classify.tsv
To run the mapreduce job:
$ hadoop jar target/mahout-naive-bayes-example2-1.0-jar-with-dependencies.jar model tweets-vectors/dictionary.file-0 tweets-vectors/df-count/part-r-00000 tweets-to-classify.tsv tweet-category
After it is done, we can copy the result from HDFS to the local filesystem:
$ hadoop fs -getmerge tweet-category tweet-category.tsv
Now we can see the results by using the ResultReader class:
public class ResultReader { public static Map<String, Integer> readCategoryByTweetIds(Configuration configuration, String tweetFileName) throws Exception { Map<String, Integer> categoryByTweetIds = new HashMap<String, Integer>(); BufferedReader reader = new BufferedReader(new FileReader(tweetFileName)); while(true) { String line = reader.readLine(); if (line == null) { break; } String[] tokens = line.split("\t", 2); String tweetId = tokens[0]; Integer categoryId = Integer.parseInt(tokens[1]); categoryByTweetIds.put(tweetId, categoryId); } reader.close(); return categoryByTweetIds; } public static void main(String[] args) throws Exception { if (args.length < 3) { System.out.println("Arguments: [label index] "); return; } String tweetFileName = args[0]; String labelIndexPath = args[1]; String tweetCategoryIdsPath = args[2]; Configuration configuration = new Configuration(); Map<String, Integer> categoryByTweetIds = readCategoryByTweetIds(configuration, tweetCategoryIdsPath); Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath)); BufferedReader reader = new BufferedReader(new FileReader(tweetFileName)); while(true) { String line = reader.readLine(); if (line == null) { break; } String[] tokens = line.split("\t", 2); String tweetId = tokens[0]; String tweet = tokens[1]; int categoryId = categoryByTweetIds.get(tweetId); System.out.println(tweetId + ": " + tweet); System.out.println(" => " + labels.get(categoryId)); } reader.close(); } }
$ java -cp target/mahout-naive-bayes-example2-1.0-jar-with-dependencies.jar com.chimpler.example.bayes2.ResultReader data/tweets-to-classify.tsv [label index path] tweet-category.tsv 309167856858308608: $13 for an iPhone 4/5 iPega Waterproof Case (Five Color Options) http://t.co/m7a5LHNw1J #DealYou #deal => tech 309167544130998272: #SAVE 18% Gored Crinkle #Skirt buy now from $44.95 #deal http://t.co/KKGfWVw5h3 => apparel 309167277155168257: Easy web hosting. $4.95 - http://t.co/0oUGS6Oj0e - Review/Coupon- http://t.co/zdgH4kv5sv #wordpress #deal #bluehost #blue host => tech 309167229054885888: Famous Footwear - 15% Off Sitewide http://t.co/vgmQxfJV4W #Deal - http://t.co/QImHB6xJ5b => apparel 309167212181221377: Team 32GB Class 10 SDHC Card for $17 + free shipping http://t.co/uD4yJgjRiK <- link #deal => tech 309166996174565376: ATTN #SINGERS: PLAY THIS CRAZY#SMASH. TAKEN: http://t.co/tNN88rMXHY. 4 FOR $15 #DEAL http://t.co/Yd7PdG6HzR => home 309166960803971072: Save 10% on PG Tips Tea http://t.co/eB2HkiK9CE #deal #cybermonday => health [...]
In this tutorial we have shown how to distribute the naive bayes classification using hadoop. We have also described how to speed up the execution of the job by minimizing the number of times the model is loaded into memory: by using the hadoop property to reuse the same JVM and by storing the data in memory using a static variable.
Hey Hi
this is Puneet Arora
While running the command I am getting the following errors please help
$ hadoop jar target/mahout-naive-bayes-example2-1.0-jar-with-dependencies.jar model tweets-vectors/dictionary.file-0 tweets-vectors/df-count/part-r-00000 tweets-to-classify.tsv tweet-category
13/06/27 12:56:02 INFO mapred.JobClient: Task Id : attempt_201306271152_0023_m_000000_0, Status : FAILED
java.lang.RuntimeException: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:849)
at org.apache.hadoop.mapreduce.JobContext.getMapperClass(JobContext.java:199)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:719)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1149)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Caused by: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:248)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:247)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:802)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:847)
… 8 more
13/06/27 12:56:07 INFO mapred.JobClient: Task Id : attempt_201306271152_0023_m_000000_1, Status : FAILED
java.lang.RuntimeException: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:849)
at org.apache.hadoop.mapreduce.JobContext.getMapperClass(JobContext.java:199)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:719)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1149)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Caused by: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:248)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:247)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:802)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:847)
… 8 more
13/06/27 12:56:13 INFO mapred.JobClient: Task Id : attempt_201306271152_0023_m_000000_2, Status : FAILED
java.lang.RuntimeException: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:849)
at org.apache.hadoop.mapreduce.JobContext.getMapperClass(JobContext.java:199)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:719)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1149)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Caused by: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:248)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:247)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:802)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:847)
… 8 more
Hi Puneet Arora,
I am not sure why you got this error and I cannot reproduce this error on my box.
By looking at http://stackoverflow.com/questions/1988561/question-on-hadoop-java-lang-runtimeexception-java-lang-classnotfoundexception?rq=1
you might try to:
1) Use the method setJarByClass:
2) Check that the class ClassifierMap is present in the jar:
3) put the class ClassifierMap in its own file ClassifierMap.java
Let us know if that helps.
Hi,
Thanks for your reply
But it didnt worked yet, yeah the now the previous errors are removed I used the method setjarbyclass
even check the class classifier map in the jar, yes it is there
but
I didnt get your last step where to put the classifiermap as there is no java file named ClassifierMap in the src folder.
Hi Puneet,
What error message do you get when you run the class now?
For the last step, you can move the class ClassifierMap which is defined in the MapReduceClassifier class:
into its own file ClassifierMap.java (you would need to create this file):
(note that this class is not defined as static now)
The reason is that it seems there is an issue with innerclass in your case, so putting the class ClassifierMap in its own file might solve the issue.
Let us know if that helps.
Pingback: Using the Mahout Naive Bayes Classifier to automatically classify Twitter messages | Chimpler
Hi,
Thanks for this wonderful example, however I got the same inner class error above and I tried to fix it as you described, but I still did not work it out. The program can run but it seems it does not find the mapper; the program freezes at map 0% reduce 0% and gets killed after 10 mins.
Is there anything that I missed?
Thanks in advance
Hii I am Sanjay Yadav I am getting the following error while running command:–
java -cp target/twitter-naive-bayes-example-1.0-jar-with-dependencies.jar com.chimpler.example.bayes.Classifier /home/hduser/mahout-naive-bayes-example/recieve/model /home/hduser/mahout-naive-bayes-example/recieve/labelindex /home/hduser/mahout-naive-bayes-example/recieve/dictionary.file-0 /home/hduser/mahout-naive-bayes-example/recieve/df-count /home/hduser/mahout-naive-bayes-example/my_data/newtweetsfinal.tsv
Error:
Exception in thread “main” java.lang.ArrayIndexOutOfBoundsException: 1
at com.chimpler.example.bayes.Classifier.main(Classifier.java:120)
Please Help me..
Hi,
I am getting below error when i run jar file thought i see eclipse does recogize the MutliSet class
13/09/02 20:42:04 INFO mapred.JobClient: Task Id : attempt_201309021921_0011_m_000000_0, Status : FAILED
Error: java.lang.ClassNotFoundException: com.google.common.collect.Multiset
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at MapReduceClassifier$ClassifierMap.initClassifier(MapReduceClassifier.java:39)
at MapReduceClassifier$ClassifierMap.setup(MapReduceClassifier.java:31)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:142)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:621)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:305)
at org.apache.hadoop.mapred.Child.main(Child.java:170)
Can you please help?
Hi,
Could you please help me why I am getting below error when i try to run the MapReduceClassifier (exported as jar)
13/09/07 20:58:51 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
13/09/07 20:58:52 INFO input.FileInputFormat: Total input paths to process : 1
13/09/07 20:58:52 INFO mapred.JobClient: Running job: job_201309071836_0006
13/09/07 20:58:53 INFO mapred.JobClient: map 0% reduce 0%
13/09/07 20:59:07 INFO mapred.JobClient: Task Id : attempt_201309071836_0006_m_000000_0, Status : FAILED
Error: java.lang.ClassNotFoundException: org.apache.mahout.math.Vector
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at MapReduceClassifier$ClassifierMap.initClassifier(MapReduceClassifier.java:39)
at MapReduceClassifier$ClassifierMap.setup(MapReduceClassifier.java:31)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:142)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:621)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:305)
at org.apache.hadoop.mapred.Child.main(Child.java:170)
I tried adding all .jars under ‘mahout-distribution-0.7’ folder to the HADOOP_CLASSPATH but not able to understand why my hadoop is throwing that it can’t recognize mahout libraries. (My eclipse doesn’t show any errors and recognizes the mahout classes just fine).
Hi,
I was also getting the same error as Puneet. After making the suggested changes, I am getting a different error. Can you please suggest how to resolve this issue.
Thanks,
Rohit.
3/09/22 19:26:11 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
13/09/22 19:26:11 INFO input.FileInputFormat: Total input paths to process : 1
13/09/22 19:26:11 INFO mapred.JobClient: Running job: job_201309210647_0047
13/09/22 19:26:12 INFO mapred.JobClient: map 0% reduce 0%
13/09/22 19:26:22 INFO mapred.JobClient: Task Id : attempt_201309210647_0047_m_000000_0, Status : FAILED
java.lang.NullPointerException
at com.chimpler.example.bayes2.Classifier.classify(Classifier.java:56)
at com.chimpler.example.bayes2.ClassifierMap.map(ClassifierMap.java:43)
at com.chimpler.example.bayes2.ClassifierMap.map(ClassifierMap.java:14)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:140)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:672)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:330)
at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
at org.apache.hadoop.mapred.Child.main(Child.java:262)
hola yo obtengo este error podrian ayudarme
java.lang.IllegalArgumentException: Unknown flags set: %d [-101111]
at com.google.common.base.Preconditions.checkArgument(Preconditions.java:119)
at org.apache.mahout.math.VectorWritable.readFields(VectorWritable.java:88)
at org.apache.mahout.math.VectorWritable.readVector(VectorWritable.java:199)
at org.apache.mahout.classifier.naivebayes.NaiveBayesModel.materialize(NaiveBayesModel.java:112)
at com.chimpler.example.bayes2.Classifier.(Classifier.java:51)
at com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap.initClassifier(MapReduceClassifier.java:33)
at com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap.setup(MapReduceClassifier.java:26)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:142)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:214)
For null pointer exception modify readDocumentFrequency method as follows :
private static Map readDocumentFrequency(Configuration conf,
Path documentFrequencyPath) throws IOException {
Map documentFrequency = new HashMap();
FileStatus[] files = documentFrequencyPath.getFileSystem(conf)
.globStatus(documentFrequencyPath);
for (FileStatus file : files) {
System.out.println(“Adding filepath ” + file.getPath()
+ ” to documentFrequency map”);
for (Pair pair : new SequenceFileIterable(
file.getPath(), true, conf)) {
documentFrequency.put(pair.getFirst().get(), pair.getSecond()
.get());
}
}
return documentFrequency;
}
Also run the classifier as follows :
hadoop jar mahout-naive-bayes-example2-1.0-jar-with-dependencies.jar model tweets-vectors/dictionary.file-0 tweets-vectors/df-count/part-r-* tweets-to-classify.tsv tweet-category