Using the Mahout Naive Bayes Classifier to automatically classify Twitter messages (part 2: distribute classification with hadoop)
2013/06/24 12 Comments
In this post, we are going to categorize the tweets by distributing the classification on the hadoop cluster. It can make the classification faster if there is a huge number of tweets to classify.
To go through this tutorial you would need to have run the commands in the post Using the Mahout Naive Bayes Classifier to automatically classify Twitter messages.
To distribute the classification on the hadoop nodes, we are going to define a mapreduce job:
- the csv containing the tweets to classify is split into several chunks
- each chunk is sent to the hadoop node that will process it by running the map class
- the map class loads the naive bayes model and some document/word frequency into memory
- for each tweet of the chunk, it computes the best matching category. The result is written in the output file. We are not using a reducer class as we don’t need to do aggregations.
To download the code used in this post, you can fetch it from github:
$ git clone https://github.com/fredang/mahout-naive-bayes-example2.git
To compile the project:
$ mvn clean package assembly:single
This repository contains the mapreduce job MapReduceClassifier.java:
public class MapReduceClassifier { public static class ClassifierMap extends Mapper<LongWritable, Text, Text, IntWritable> { private final static Text outputKey = new Text(); private final static IntWritable outputValue = new IntWritable(); private static Classifier classifier; @Override protected void setup(Context context) throws IOException { initClassifier(context); } private static void initClassifier(Context context) throws IOException { if (classifier == null) { synchronized (ClassifierMap.class) { if (classifier == null) { classifier = new Classifier(context.getConfiguration()); } } } } public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); String[] tokens = line.split("\t", 2); String tweetId = tokens[0]; String tweet = tokens[1]; int bestCategoryId = classifier.classify(tweet); outputValue.set(bestCategoryId); outputKey.set(tweetId); context.write(outputKey, outputValue); } } public static void main(String[] args) throws Exception { if (args.length < 5) { System.out.println("Arguments: [model] [dictionnary] [document frequency] [output directory]"); return; } String modelPath = args[0]; String dictionaryPath = args[1]; String documentFrequencyPath = args[2]; String tweetsPath = args[3]; String outputPath = args[4]; Configuration conf = new Configuration(); conf.setStrings(Classifier.MODEL_PATH_CONF, modelPath); conf.setStrings(Classifier.DICTIONARY_PATH_CONF, dictionaryPath); conf.setStrings(Classifier.DOCUMENT_FREQUENCY_PATH_CONF, documentFrequencyPath); // do not create a new jvm for each task conf.setLong("mapred.job.reuse.jvm.num.tasks", -1); Job job = new Job(conf, "classifier"); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(ClassifierMap.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(tweetsPath)); 
FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.waitForCompletion(true); } }
It is using the classifier class:
public class Classifier { public final static String MODEL_PATH_CONF = "modelPath"; public final static String DICTIONARY_PATH_CONF = "dictionaryPath"; public final static String DOCUMENT_FREQUENCY_PATH_CONF = "documentFrequencyPath"; private static StandardNaiveBayesClassifier classifier; private static Map<String, Integer> dictionary; private static Map<Integer, Long> documentFrequency; private static Analyzer analyzer; public Classifier(Configuration configuration) throws IOException { String modelPath = configuration.getStrings(MODEL_PATH_CONF)[0]; String dictionaryPath = configuration.getStrings(DICTIONARY_PATH_CONF)[0]; String documentFrequencyPath = configuration.getStrings(DOCUMENT_FREQUENCY_PATH_CONF)[0]; dictionary = readDictionnary(configuration, new Path(dictionaryPath)); documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath)); // analyzer used to extract word from tweet analyzer = new DefaultAnalyzer(); NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration); classifier = new StandardNaiveBayesClassifier(model); } public int classify(String text) throws IOException { int documentCount = documentFrequency.get(-1).intValue(); Multiset words = ConcurrentHashMultiset.create(); // extract words from tweet TokenStream ts = analyzer.reusableTokenStream("text", new StringReader(text)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); int wordCount = 0; while (ts.incrementToken()) { if (termAtt.length() > 0) { String word = ts.getAttribute(CharTermAttribute.class).toString(); Integer wordId = dictionary.get(word); // if the word is not in the dictionary, skip it if (wordId != null) { words.add(word); wordCount++; } } } // create vector wordId => weight using tfidf Vector vector = new RandomAccessSparseVector(10000); TFIDF tfidf = new TFIDF(); for (Multiset.Entry entry:words.entrySet()) { String word = entry.getElement(); int count = entry.getCount(); Integer 
wordId = dictionary.get(word); Long freq = documentFrequency.get(wordId); double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount); vector.setQuick(wordId, tfIdfValue); } // With the classifier, we get one score for each label // The label with the highest score is the one the tweet is more likely to // be associated to Vector resultVector = classifier.classifyFull(vector); double bestScore = -Double.MAX_VALUE; int bestCategoryId = -1; for(Element element: resultVector) { int categoryId = element.index(); double score = element.get(); if (score > bestScore) { bestScore = score; bestCategoryId = categoryId; } } return bestCategoryId; } private static Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) { Map<String, Integer> dictionnary = new HashMap<String, Integer>(); for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true, conf)) { dictionnary.put(pair.getFirst().toString(), pair.getSecond().get()); } return dictionnary; } private static Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) { Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>(); for (Pair<IntWritable, LongWritable> pair : new SequenceFileIterable<IntWritable, LongWritable>(documentFrequencyPath, true, conf)) { documentFrequency.put(pair.getFirst().get(), pair.getSecond().get()); } return documentFrequency; } }
In this code, we are trying to minimize the number of times we load the naive bayes model in memory.
When the input file is split into chunks, they are distributed across the nodes of the hadoop cluster. Each node will handle some chunks. For each of those chunks, Hadoop spawns a new JVM (process) instead of reusing an existing one. We can force hadoop to reuse an existing JVM with this:
conf.setLong("mapred.job.reuse.jvm.num.tasks", -1);
In the JVM, we can keep the data model in memory so that subsequent tasks that use the same JVM can use the model without having to reload it. This is done by using a static attribute (see the method MapReduceClassifier.initClassifier).
If you have run the commands in the previous post, you should have the following files in HDFS:
- tweets-vectors/dictionary.file-0
- tweets-vectors/df-count/part-r-00000
We would need to copy the file data/tweets-to-classify.tsv to HDFS so it can be read by the hadoop job:
$ hadoop fs -put data/tweets-to-classify.tsv tweets-to-classify.tsv
To run the mapreduce job:
$ hadoop jar target/mahout-naive-bayes-example2-1.0-jar-with-dependencies.jar model tweets-vectors/dictionary.file-0 tweets-vectors/df-count/part-r-00000 tweets-to-classify.tsv tweet-category
After it is done, we can copy the result from HDFS to the local filesystem:
$ hadoop fs -getmerge tweet-category tweet-category.tsv
Now we can see the results by using the ResultReader class:
public class ResultReader { public static Map<String, Integer> readCategoryByTweetIds(Configuration configuration, String tweetFileName) throws Exception { Map<String, Integer> categoryByTweetIds = new HashMap<String, Integer>(); BufferedReader reader = new BufferedReader(new FileReader(tweetFileName)); while(true) { String line = reader.readLine(); if (line == null) { break; } String[] tokens = line.split("\t", 2); String tweetId = tokens[0]; Integer categoryId = Integer.parseInt(tokens[1]); categoryByTweetIds.put(tweetId, categoryId); } reader.close(); return categoryByTweetIds; } public static void main(String[] args) throws Exception { if (args.length < 3) { System.out.println("Arguments: [label index] "); return; } String tweetFileName = args[0]; String labelIndexPath = args[1]; String tweetCategoryIdsPath = args[2]; Configuration configuration = new Configuration(); Map<String, Integer> categoryByTweetIds = readCategoryByTweetIds(configuration, tweetCategoryIdsPath); Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath)); BufferedReader reader = new BufferedReader(new FileReader(tweetFileName)); while(true) { String line = reader.readLine(); if (line == null) { break; } String[] tokens = line.split("\t", 2); String tweetId = tokens[0]; String tweet = tokens[1]; int categoryId = categoryByTweetIds.get(tweetId); System.out.println(tweetId + ": " + tweet); System.out.println(" => " + labels.get(categoryId)); } reader.close(); } }
$ java -cp target/mahout-naive-bayes-example2-1.0-jar-with-dependencies.jar com.chimpler.example.bayes2.ResultReader data/tweets-to-classify.tsv [label index path] tweet-category.tsv 309167856858308608: $13 for an iPhone 4/5 iPega Waterproof Case (Five Color Options) http://t.co/m7a5LHNw1J #DealYou #deal => tech 309167544130998272: #SAVE 18% Gored Crinkle #Skirt buy now from $44.95 #deal http://t.co/KKGfWVw5h3 => apparel 309167277155168257: Easy web hosting. $4.95 - http://t.co/0oUGS6Oj0e - Review/Coupon- http://t.co/zdgH4kv5sv #wordpress #deal #bluehost #blue host => tech 309167229054885888: Famous Footwear - 15% Off Sitewide http://t.co/vgmQxfJV4W #Deal - http://t.co/QImHB6xJ5b => apparel 309167212181221377: Team 32GB Class 10 SDHC Card for $17 + free shipping http://t.co/uD4yJgjRiK <- link #deal => tech 309166996174565376: ATTN #SINGERS: PLAY THIS CRAZY#SMASH. TAKEN: http://t.co/tNN88rMXHY. 4 FOR $15 #DEAL http://t.co/Yd7PdG6HzR => home 309166960803971072: Save 10% on PG Tips Tea http://t.co/eB2HkiK9CE #deal #cybermonday => health [...]
In this tutorial we have shown how to distribute the naive bayes classification using hadoop. We have also described how to speed up the execution of the job by minimizing the number of times the model is loaded into memory: by using the hadoop property to reuse the same JVM and by storing the data in memory using a static variable.
Hey Hi
this is Puneet Arora
While running the command I am getting the following errors please help
$ hadoop jar target/mahout-naive-bayes-example2-1.0-jar-with-dependencies.jar model tweets-vectors/dictionary.file-0 tweets-vectors/df-count/part-r-00000 tweets-to-classify.tsv tweet-category
13/06/27 12:56:02 INFO mapred.JobClient: Task Id : attempt_201306271152_0023_m_000000_0, Status : FAILED
java.lang.RuntimeException: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:849)
at org.apache.hadoop.mapreduce.JobContext.getMapperClass(JobContext.java:199)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:719)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1149)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Caused by: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:248)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:247)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:802)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:847)
… 8 more
13/06/27 12:56:07 INFO mapred.JobClient: Task Id : attempt_201306271152_0023_m_000000_1, Status : FAILED
java.lang.RuntimeException: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:849)
at org.apache.hadoop.mapreduce.JobContext.getMapperClass(JobContext.java:199)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:719)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1149)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Caused by: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:248)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:247)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:802)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:847)
… 8 more
13/06/27 12:56:13 INFO mapred.JobClient: Task Id : attempt_201306271152_0023_m_000000_2, Status : FAILED
java.lang.RuntimeException: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:849)
at org.apache.hadoop.mapreduce.JobContext.getMapperClass(JobContext.java:199)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:719)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1149)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
Caused by: java.lang.ClassNotFoundException: com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:248)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:247)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:802)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:847)
… 8 more
Hi Puneet Arora,
I am not sure why you got this error and I cannot reproduce this error on my box.
By looking at http://stackoverflow.com/questions/1988561/question-on-hadoop-java-lang-runtimeexception-java-lang-classnotfoundexception?rq=1
you might try to:
1) Use the method setJarByClass:
2) Check that the class ClassifierMap is present in the jar:
3) put the class ClassifierMap in its own file ClassifierMap.java
Let us know if that helps.
Hi,
Thanks for your reply
But it didnt worked yet, yeah the now the previous errors are removed I used the method setjarbyclass
even check the class classifier map in the jar, yes it is there
but
I didnt get your last step where to put the classifiermap as there is no java file named ClassifierMap in the src folder.
Hi Puneet,
What error message do you get when you run the class now?
For the last step, you can move the class ClassifierMap which is defined in the MapReduceClassifier class:
into its own file ClassifierMap.java (you would need to create this file):
(note that this class is not defined as static now)
The reason is that it seems there is an issue with innerclass in your case, so putting the class ClassifierMap in its own file might solve the issue.
Let us know if that helps.
Pingback: Using the Mahout Naive Bayes Classifier to automatically classify Twitter messages | Chimpler
Hi,
Thanks for this wonderful example, however I got the same inner class error above and I tried to fix it as you described, but I still did not work it out. The program can run but it seems it does not find the mapper; the program freezes at map 0% reduce 0% and gets killed after 10 mins.
Is there anything that I missed?
Thanks in advance
Hii I am Sanjay Yadav I am getting the following error while running command:–
java -cp target/twitter-naive-bayes-example-1.0-jar-with-dependencies.jar com.chimpler.example.bayes.Classifier /home/hduser/mahout-naive-bayes-example/recieve/model /home/hduser/mahout-naive-bayes-example/recieve/labelindex /home/hduser/mahout-naive-bayes-example/recieve/dictionary.file-0 /home/hduser/mahout-naive-bayes-example/recieve/df-count /home/hduser/mahout-naive-bayes-example/my_data/newtweetsfinal.tsv
Error:
Exception in thread “main” java.lang.ArrayIndexOutOfBoundsException: 1
at com.chimpler.example.bayes.Classifier.main(Classifier.java:120)
Please Help me..
Hi,
I am getting below error when i run jar file thought i see eclipse does recogize the MutliSet class
13/09/02 20:42:04 INFO mapred.JobClient: Task Id : attempt_201309021921_0011_m_000000_0, Status : FAILED
Error: java.lang.ClassNotFoundException: com.google.common.collect.Multiset
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at MapReduceClassifier$ClassifierMap.initClassifier(MapReduceClassifier.java:39)
at MapReduceClassifier$ClassifierMap.setup(MapReduceClassifier.java:31)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:142)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:621)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:305)
at org.apache.hadoop.mapred.Child.main(Child.java:170)
Can you please help?
Hi,
Could you please help me why I am getting below error when i try to run the MapReduceClassifier (exported as jar)
13/09/07 20:58:51 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
13/09/07 20:58:52 INFO input.FileInputFormat: Total input paths to process : 1
13/09/07 20:58:52 INFO mapred.JobClient: Running job: job_201309071836_0006
13/09/07 20:58:53 INFO mapred.JobClient: map 0% reduce 0%
13/09/07 20:59:07 INFO mapred.JobClient: Task Id : attempt_201309071836_0006_m_000000_0, Status : FAILED
Error: java.lang.ClassNotFoundException: org.apache.mahout.math.Vector
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at MapReduceClassifier$ClassifierMap.initClassifier(MapReduceClassifier.java:39)
at MapReduceClassifier$ClassifierMap.setup(MapReduceClassifier.java:31)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:142)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:621)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:305)
at org.apache.hadoop.mapred.Child.main(Child.java:170)
I tried adding all .jars under ‘mahout-distribution-0.7’ folder to the HADOOP_CLASSPATH but not able to understand why my hadoop is throwing that it can’t recognize mahout libraries. (My eclipse doesn’t show any errors and recognizes the mahout classes just fine).
Hi,
I was also getting the same error as Puneet. After making the suggested changes, I am getting a different error. Can you please suggest how to resolve this issue.
Thanks,
Rohit.
3/09/22 19:26:11 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
13/09/22 19:26:11 INFO input.FileInputFormat: Total input paths to process : 1
13/09/22 19:26:11 INFO mapred.JobClient: Running job: job_201309210647_0047
13/09/22 19:26:12 INFO mapred.JobClient: map 0% reduce 0%
13/09/22 19:26:22 INFO mapred.JobClient: Task Id : attempt_201309210647_0047_m_000000_0, Status : FAILED
java.lang.NullPointerException
at com.chimpler.example.bayes2.Classifier.classify(Classifier.java:56)
at com.chimpler.example.bayes2.ClassifierMap.map(ClassifierMap.java:43)
at com.chimpler.example.bayes2.ClassifierMap.map(ClassifierMap.java:14)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:140)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:672)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:330)
at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
at org.apache.hadoop.mapred.Child.main(Child.java:262)
hola yo obtengo este error podrian ayudarme
java.lang.IllegalArgumentException: Unknown flags set: %d [-101111]
at com.google.common.base.Preconditions.checkArgument(Preconditions.java:119)
at org.apache.mahout.math.VectorWritable.readFields(VectorWritable.java:88)
at org.apache.mahout.math.VectorWritable.readVector(VectorWritable.java:199)
at org.apache.mahout.classifier.naivebayes.NaiveBayesModel.materialize(NaiveBayesModel.java:112)
at com.chimpler.example.bayes2.Classifier.(Classifier.java:51)
at com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap.initClassifier(MapReduceClassifier.java:33)
at com.chimpler.example.bayes2.MapReduceClassifier$ClassifierMap.setup(MapReduceClassifier.java:26)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:142)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:214)
For null pointer exception modify readDocumentFrequency method as follows :
private static Map readDocumentFrequency(Configuration conf,
Path documentFrequencyPath) throws IOException {
Map documentFrequency = new HashMap();
FileStatus[] files = documentFrequencyPath.getFileSystem(conf)
.globStatus(documentFrequencyPath);
for (FileStatus file : files) {
System.out.println(“Adding filepath ” + file.getPath()
+ ” to documentFrequency map”);
for (Pair pair : new SequenceFileIterable(
file.getPath(), true, conf)) {
documentFrequency.put(pair.getFirst().get(), pair.getSecond()
.get());
}
}
return documentFrequency;
}
Also run the classifier as follows :
hadoop jar mahout-naive-bayes-example2-1.0-jar-with-dependencies.jar model tweets-vectors/dictionary.file-0 tweets-vectors/df-count/part-r-* tweets-to-classify.tsv tweet-category