Commit 59bf1c8

Merge branch 'master' of https://github.com/kafka-dev/kafka
2 parents b7865e3 + 9706f45 commit 59bf1c8

73 files changed (+1862 / -2142 lines)

README.md

Lines changed: 6 additions & 0 deletions

@@ -12,3 +12,9 @@ Kafka is aimed at providing a publish-subscribe solution that can handle all act
 See our [web site](http://sna-projects.com/kafka) for more details on the project.
 
 Kafka is a new project, and we are interested in building the community; we would welcome any thoughts or patches. You can reach us [here](http://groups.google.com/group/kafka-dev).
+
+To get kafka code:
+  git clone git@github.com:kafka-dev/kafka.git kafka
+
+To run unit tests:
+  ant test (you need to make sure that scala 2.8.0 is in your PATH)

bin/kafka-console-consumer-log4j.properties

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+log4j.rootLogger=INFO, stderr
+
+log4j.appender.stderr=org.apache.log4j.ConsoleAppender
+log4j.appender.stderr.target=System.err
+log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
+log4j.appender.stderr.layout.ConversionPattern=[%d] %p %m (%c)%n
+

bin/kafka-console-consumer.sh

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+#!/bin/bash
+
+base_dir=$(dirname $0)
+export KAFKA_OPTS="-Xmx512M -server -Dcom.sun.management.jmxremote -Dlog4j.configuration=file:$base_dir/kafka-console-consumer-log4j.properties"
+$base_dir/kafka-run-class.sh kafka.consumer.ConsoleConsumer $@

bin/kafka-run-class.sh

Lines changed: 1 addition & 5 deletions

@@ -8,8 +8,6 @@ fi
 
 base_dir=$(dirname $0)/..
 
-#CLASSPATH=$CLASSPATH:bin
-
 for file in $base_dir/dist/*.jar;
 do
   CLASSPATH=$CLASSPATH:$file
@@ -20,10 +18,8 @@ do
   CLASSPATH=$CLASSPATH:$file
 done
 
-CLASSPATH=dist:$CLASSPATH
-
 if [ -z "$KAFKA_OPTS" ]; then
-  KAFKA_OPTS="-Xmx512M -server -Dcom.sun.management.jmxremote"
+  KAFKA_OPTS="-Xmx512M -server -Dcom.sun.management.jmxremote -Dlog4j.configuration=file:$base_dir/dist/log4j.properties "
 fi
 
 if [ -z "$JAVA_HOME" ]; then

build.xml

Lines changed: 70 additions & 6 deletions

@@ -37,26 +37,56 @@
 
   <target name="all" depends="clean, jar" description="Build all artifacts." />
 
+  <target name="fsc" depends="clean, jar-fsc" description="Build all artifacts using the fast scala compiler." />
+
   <target name="clean" description="Delete generated files.">
     <delete dir="${dist.dir}" />
     <replace-dir dir="${javadoc.dir}" />
   </target>
 
-  <target name="build" depends="init" description="Compile main source tree">
+  <target name="build-fsc" depends="init" description="Compile main source tree using the fast scala compiler">
     <replace-dir dir="${classes.dir}" />
     <replace-dir dir="${testclasses.dir}" />
-    <scalac srcdir="${src.dir}"
+    <fsc srcdir="${src.dir}"
             destdir="${classes.dir}"
             classpathref="main-classpath"
            force="changed"
            target="jvm-1.5">
      <include name="**/*.scala" />
+    </fsc>
+    <fsc srcdir="${unittestsrc.dir}"
+         destdir="${testclasses.dir}"
+         classpathref="test-classpath"
+         force="changed"
+         target="jvm-1.5">
+      <include name="**/*.scala"/>
+    </fsc>
+    <fsc srcdir="${othertestsrc.dir}"
+         destdir="${testclasses.dir}"
+         classpathref="test-classpath"
+         force="changed"
+         target="jvm-1.5">
+      <include name="**/*.scala"/>
+    </fsc>
+    <copy file="${src.dir}/log4j.properties" todir="${dist.dir}" />
+    <copy file="${test.dir}/log4j.properties" todir="${testclasses.dir}" />
+  </target>
+
+  <target name="build" depends="init" description="Compile main source tree">
+    <replace-dir dir="${classes.dir}" />
+    <replace-dir dir="${testclasses.dir}" />
+    <scalac srcdir="${src.dir}"
+            destdir="${classes.dir}"
+            classpathref="main-classpath"
+            force="changed"
+            target="jvm-1.5">
+      <include name="**/*.scala" />
     </scalac>
     <scalac srcdir="${unittestsrc.dir}"
             destdir="${testclasses.dir}"
             classpathref="test-classpath"
             force="changed"
-           target="jvm-1.5">
+            target="jvm-1.5">
       <include name="**/*.scala"/>
     </scalac>
     <scalac srcdir="${othertestsrc.dir}"
@@ -66,8 +96,8 @@
             target="jvm-1.5">
       <include name="**/*.scala"/>
     </scalac>
-   <copy file="${src.dir}/log4j.properties" todir="${dist.dir}" />
-   <copy file="${test.dir}/log4j.properties" todir="${testclasses.dir}" />
+    <copy file="${src.dir}/log4j.properties" todir="${dist.dir}" />
+    <copy file="${test.dir}/log4j.properties" todir="${testclasses.dir}" />
   </target>
 
   <target name="jar" depends="build" description="Build jar file">
@@ -83,10 +113,44 @@
     </jar>
   </target>
 
+  <target name="jar-fsc" depends="build-fsc" description="Build jar file using the fast scala compiler">
+    <jar destfile="${dist.dir}/${name}-${curr.release}.jar">
+      <fileset dir="${classes.dir}">
+        <include name="**/*.*" />
+      </fileset>
+    </jar>
+    <jar destfile="${dist.dir}/${name}-test-${curr.release}.jar">
+      <fileset dir="${testclasses.dir}">
+        <include name="**/*.*" />
+      </fileset>
+    </jar>
+  </target>
+
   <target name="test" depends="jar" description="Run junit tests.">
     <replace-dir dir="${testreport.dir}" />
     <replace-dir dir="${testhtml.dir}" />
-    <junit printsummary="on" showoutput="false">
+    <junit printsummary="on" showoutput="true">
+      <classpath refid="test-classpath" />
+      <formatter type="xml" />
+      <batchtest fork="yes" todir="${testreport.dir}">
+        <fileset dir="${testclasses.dir}">
+          <include name="**/*Test.class" />
+          <exclude name="**/Abstract*.class" />
+        </fileset>
+      </batchtest>
+    </junit>
+    <junitreport todir="${testhtml.dir}">
+      <fileset dir="${testreport.dir}">
+        <include name="TEST-*.xml" />
+      </fileset>
+      <report todir="${testhtml.dir}" format="frames" />
+    </junitreport>
+  </target>
+
+  <target name="test-fsc" depends="jar-fsc" description="Run junit tests using the fast scala compiler">
+    <replace-dir dir="${testreport.dir}" />
+    <replace-dir dir="${testhtml.dir}" />
+    <junit printsummary="on" showoutput="true">
     <classpath refid="test-classpath" />
     <formatter type="xml" />
     <batchtest fork="yes" todir="${testreport.dir}">

clients/python/kafka.py

Lines changed: 40 additions & 14 deletions

@@ -14,29 +14,55 @@
 
 import socket
 import struct
+import binascii
+import sys
 
-class Message:
-    def __init__(self, topic, payload):
-        self.topic = topic
-        self.payload = payload
+PRODUCE_REQUEST_ID = 0
 
-    ## Message format is 4 byte length, 2 byte topic length, N byte topic and M byte payload
-    def encode(self):
-        return struct.pack('>i', len(self.payload) + len(self.topic) + 2) + \
-               struct.pack('>h', len(self.topic)) + self.topic + self.payload
+def encode_message(message):
+    # <MAGIC_BYTE: char> <CRC32: int> <PAYLOAD: bytes>
+    return struct.pack('>B', 0) + \
+           struct.pack('>i', binascii.crc32(message)) + \
+           message
+
+def encode_produce_request(topic, partition, messages):
+    # encode messages as <LEN: int><MESSAGE_BYTES>
+    encoded = [encode_message(message) for message in messages]
+    message_set = ''.join([struct.pack('>i', len(m)) + m for m in encoded])
+
+    # create the request as <REQUEST_SIZE: int> <REQUEST_ID: short> <TOPIC: bytes> <PARTITION: int> <BUFFER_SIZE: int> <BUFFER: bytes>
+    data = struct.pack('>H', PRODUCE_REQUEST_ID) + \
+           struct.pack('>H', len(topic)) + topic + \
+           struct.pack('>i', partition) + \
+           struct.pack('>i', len(message_set)) + message_set
+    return struct.pack('>i', len(data)) + data
 
 
 class KafkaProducer:
-    def __init__(self, topic, host, port):
+    def __init__(self, host, port):
         self.REQUEST_KEY = 0
-        self.topic = topic
         self.connection = socket.socket()
         self.connection.connect((host, port))
 
     def close(self):
         self.connection.close()
 
-    def send(self, message):
-        encoded = message.encode()
-        self.connection.send(struct.pack('>i', len(encoded) + 2) + struct.pack('>h', self.REQUEST_KEY) + encoded)
-
+    def send(self, messages, topic, partition = 0):
+        self.connection.sendall(encode_produce_request(topic, partition, messages))
+
+if __name__ == '__main__':
+    if len(sys.argv) < 4:
+        print >> sys.stderr, 'USAGE: python', sys.argv[0], 'host port topic'
+        sys.exit(1)
+    host = sys.argv[1]
+    port = int(sys.argv[2])
+    topic = sys.argv[3]
+
+    producer = KafkaProducer(host, port)
+
+    while True:
+        print 'Enter comma seperated messages: ',
+        line = sys.stdin.readline()
+        messages = line.split(',')
+        producer.send(messages, topic)
+        print 'Sent', len(messages), 'messages successfully'
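
Note (not part of this commit): the new wire format frames each message as <MAGIC_BYTE: char> <CRC32: int> <PAYLOAD: bytes>, and each framed message is length-prefixed inside the produce request. A minimal Python 2 sketch of the consumer-side counterpart, assuming the same framing; decode_message is a hypothetical helper, not something in kafka.py:

    import binascii
    import struct

    def encode_message(message):
        # producer-side framing, as added in kafka.py above:
        # <MAGIC_BYTE: char> <CRC32: int> <PAYLOAD: bytes>
        return struct.pack('>B', 0) + \
               struct.pack('>i', binascii.crc32(message)) + \
               message

    def decode_message(buf):
        # hypothetical reverse of encode_message: split off the magic byte
        # and CRC, then verify the checksum of the remaining payload
        magic, crc = struct.unpack('>Bi', buf[:5])
        payload = buf[5:]
        if crc != binascii.crc32(payload):
            raise ValueError('CRC mismatch')
        return magic, payload

    if __name__ == '__main__':
        magic, payload = decode_message(encode_message('hello'))
        print magic, payload   # prints: 0 hello

The same length-prefix pattern (struct.pack('>i', len(...)) followed by the bytes) is used both for each message inside the message set and for the request as a whole.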

config/server.properties

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ log.cleanup.interval.mins=1
 log.retention.hours=168
 
 #the number of messages to accept without flushing the log to disk
-log.flush.interval=1000
+log.flush.interval=1
 
 #set the following properties to use zookeeper
 
contrib/hadoop-consumer/README

Lines changed: 31 additions & 54 deletions

@@ -5,79 +5,56 @@ It requires the following inputs from a configuration file
 
 kafka.etl.topic : the topic to be fetched;
 
-kafka.nodes : a hdfs file containing kafka nodes description;
-              test/kafka-nodes.txt is an exmple;
-
-kafka.etl.config: a hdfs file containing topic configuration;
-              test/kafka-config.txt is an exmple;
-
-offsets.root : input directory containing offsets in text format;
-              they will be automatically generated in the first run;
+input : input directory containing topic offsets and
+        it can be generated by DataGenerator;
         the number of files in this directory determines the
-        number of mappers thus the number of fetch jobs;
+        number of mappers in the hadoop job;
 
-output.root : output directory containing kafka data and updated
-              offset files;
+output : output directory containing kafka data and updated
+         topic offsets;
 
-partition.granularity:
-              We assume all topic data contain a "time" field
-              and we partition data based on specified granularity.
-              Accepted values are: minute/hour/day. Data in the
-              same partition will go to the same reduce() call.
+kafka.request.limit : it is used to limit the number events fetched.
 
-KafkaETLJob is an abstract class which sets up job properties and
-files Hadoop job. Users need to implement methods to provide Mapper
-and Reducer classes to be used.
+KafkaETLRecordReader is a record reader associated with KafkaETLInputFormat.
+It fetches kafka data from the server. It starts from provided offsets
+(specified by "input") and stops when it reaches the largest available offsets
+or the specified limit (specified by "kafka.request.limit").
 
-KafkaETLMapper is an abstract class which fetches kafka data from
-the server. It starts from provided offsets (by default, starts from
-smallest offsets available in server) and stops when it reaches
-the largest available offsets. Users need to implement methods to
-decode fetched data (to get timestamp field), and to determine job
-status and stopping conditions.
+KafkaETLJob contains some helper functions to initialize job configuration.
 
-KafkaETLReducer is an abstract class which is used to partition
-outputs. Users need to implement methods to generate output key
-and value. They can also turn off partitioning by setting the
-number of reducers to be 0.
+SimpleKafkaETLJob sets up job properties and files Hadoop job.
 
-We include a simple implementation SimpleKafkaETLJob which fetches
-test events (in text format) from server and store data in HDFS.
+SimpleKafkaETLMapper dumps kafka data into hdfs.
 
 HOW TO RUN:
-1. Complile the code using "ant jar".
+1. Complile using "ant" and you will see kafka-etl-contrib-<version>.jar
+   under $KAFKA_ROOT/dist.
 
-2. Generate test events in server:
+2. Produce test events in server and generate offset files
  1) Start kafka server [ Follow the quick start -
     http://sna-projects.com/kafka/quickstart.php ]
+
 2) Update test/test.properties to change the following parameters:
   kafka.etl.topic : topic name
   event.count : number of events to be generated
-  kafka.nodes : hdfs location of kafka nodes configuration
-                (test/kafka-nodes.txt is an example).
-  3) Generate events
-   ./bin/run-class.sh kafka.etl.impl.DataGenerator test/test.properties
-
+  kafka.server.uri : kafka server uri;
+  input : hdfs directory of offset files
+
+ 3) Produce test events to Kafka server and generate offset files
+   ./run-class.sh kafka.etl.impl.DataGenerator test/test.properties
 
 3. Fetch generated topic into HDFS:
 1) Update test/test.properties to change the following parameters:
  hadoop.job.ugi : id and group
- kafka.etl.config: hdfs location of kafka configuration file
-                   (test/kafka-config.txt is an example)
- offsets.root : input location (you can provide a hdfs dir
-                where you have write permission and the job will
-                automatically generate starting offset files)
- output.root : output location (please provide a hdfs dir
-               where you have write permission)
- partition.granularity : partition granularity
-
- 2) Fetch data
- ./bin/run-class.sh kafka.etl.impl.SimpleKafkaETLJob test/test.properties
-
-
-
-
-
+ input : input location
+ output : output location
+ kafka.request.limit: limit the number of events to be fetched;
+                      -1 means no limitation.
+ hdfs.default.classpath.dir : hdfs location of jars
 
+2) copy jars into hdfs
+ ./copy-jars.sh ${hdfs.default.classpath.dir}
 
+2) Fetch data
+ ./run-class.sh kafka.etl.impl.SimpleKafkaETLJob test/test.properties
 
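
Note (not part of this commit): pulling the parameters above together, a test/test.properties for a small local run might look roughly like the sketch below. Every value is an illustrative placeholder; the authoritative key list and defaults are in the test/test.properties file shipped with the contrib module.

    # topic and test-data generation
    kafka.etl.topic=SimpleTestEvent
    kafka.server.uri=tcp://localhost:9092
    event.count=1000

    # hadoop job settings
    hadoop.job.ugi=hadoop,hadoop
    input=/tmp/kafka/etl/offsets
    output=/tmp/kafka/etl/output
    kafka.request.limit=-1
    hdfs.default.classpath.dir=/tmp/kafka/lib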
