Thursday, February 23, 2017

Writing ORC files is easier than a few years ago

Several years ago I was asked to compare writing the Parquet and ORC file formats from standalone Java (without using the Hadoop libraries). At the time ORC had not yet been separated from Hive, and writing it was much more involved than writing Parquet from Java. That changed in 2015 when ORC became its own Apache project, but I only revisited the issue within the past few months.

To build ORC:

Download the current release (currently 1.3.2), then:

tar xzvf orc-1.3.2.tar.gz && cd ./orc-1.3.2/
cd ./java
mvn package

The build produces an uber jar with the dependencies bundled in:

ls -la ./tools/target/orc-tools-1.3.2-uber.jar
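
With that jar on the classpath, the examples below compile and run without a full Hadoop installation. A minimal sketch, assuming the source files sit next to the build tree (the paths here are illustrative, not from the steps above):

javac -cp ./tools/target/orc-tools-1.3.2-uber.jar WriteORCFileWORCCore.java ReadORCFileWORCCore.java
java -cp .:./tools/target/orc-tools-1.3.2-uber.jar WriteORCFileWORCCore

If the compiler complains about missing Hadoop classes, add the Hadoop client jars to the classpath as well.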

A simple example of writing is:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

/*
 * Basic code from https://orc.apache.org/docs/core-java.html#writing-orc-files
 * Using Core Java - Writing ORC Files
 *
 * orc-tools-X.Y.Z-uber.jar is required in the runtime classpath for
 * io/airlift/compress/Decompressor.
 *
 * Creates myfile.orc AND .myfile.orc.crc; fails if myfile.orc already exists.
 *
 * awcoleman@gmail.com
 */
public class WriteORCFileWORCCore {

  public WriteORCFileWORCCore() throws IllegalArgumentException, IOException {
    String outfilename = "/tmp/myfile.orc";
    Configuration conf = new Configuration(false);

    /*
     * Writer is in orc-core-1.3.2.jar and has dependencies on the
     * Hadoop HDFS client libs.
     */
    TypeDescription schema = TypeDescription.fromString("struct<x:int,y:int>");
    Writer writer = OrcFile.createWriter(new Path(outfilename),
        OrcFile.writerOptions(conf).setSchema(schema));

    /*
     * VectorizedRowBatch and LongColumnVector are in hive-storage-api-2.1.1-pre-orc.jar
     */
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector x = (LongColumnVector) batch.cols[0];
    LongColumnVector y = (LongColumnVector) batch.cols[1];
    for (int r = 0; r < 10000; ++r) {
      int row = batch.size++;
      x.vector[row] = r;
      y.vector[row] = r * 3;
      // If the batch is full, write it out and start over.
      if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    // Write the last partial batch out, if any rows remain.
    if (batch.size != 0) {
      writer.addRowBatch(batch);
      batch.reset();
    }
    writer.close();

    // Output info to console
    System.out.println("Wrote " + writer.getNumberOfRows() + " records to ORC file "
        + new Path(outfilename).toString());
  }

  public static void main(String[] args) throws IllegalArgumentException, IOException {
    @SuppressWarnings("unused")
    WriteORCFileWORCCore mainobj = new WriteORCFileWORCCore();
  }
}
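
Once the file is written, it can be sanity checked with the tools jar built above. The tools jar includes a meta command that dumps the file's schema and stripe metadata (the exact output varies by version):

java -jar ./tools/target/orc-tools-1.3.2-uber.jar meta /tmp/myfile.orc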

And a simple example of reading is:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

/*
 * Basic code from https://orc.apache.org/docs/core-java.html#reading-orc-files
 * Using Core Java - Reading ORC Files
 *
 * Reads the ORC file written by WriteORCFileWORCCore.
 *
 * orc-tools-X.Y.Z-uber.jar is required in the runtime classpath for
 * io/airlift/compress/Decompressor.
 *
 * awcoleman@gmail.com
 */
public class ReadORCFileWORCCore {

  public ReadORCFileWORCCore() throws IllegalArgumentException, IOException {
    String infilename = "/tmp/myfile.orc";
    Configuration conf = new Configuration(false);

    Reader reader = OrcFile.createReader(new Path(infilename),
        OrcFile.readerOptions(conf));
    RecordReader rows = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();

    // Some basic info about the ORC file
    TypeDescription schema = reader.getSchema();
    long numRecsInFile = reader.getNumberOfRows();
    System.out.println("Reading ORC file " + new Path(infilename).toString());
    System.out.println("ORC file schema: " + schema.toJson());
    System.out.println("Number of records in ORC file: " + numRecsInFile);

    while (rows.nextBatch(batch)) {
      System.out.println("Processing batch of records from ORC file. Number of records in batch: " + batch.size);
      LongColumnVector field1 = (LongColumnVector) batch.cols[0];
      LongColumnVector field2 = (LongColumnVector) batch.cols[1];
      for (int r = 0; r < batch.size; ++r) {
        long field1rowr = field1.vector[r];
        long field2rowr = field2.vector[r];
        System.out.println("Row " + r + " in this batch: field1 is " + field1rowr + " and field2 is " + field2rowr);
      }
    }
    rows.close();
  }

  public static void main(String[] args) throws IllegalArgumentException, IOException {
    @SuppressWarnings("unused")
    ReadORCFileWORCCore mainObj = new ReadORCFileWORCCore();
  }
}
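
The same batch-filling pattern extends to other column types. As a rough sketch (the class name and output path below are illustrative, not from the examples above), a string field in the schema comes back from createRowBatch() as a BytesColumnVector, which is populated with setVal() rather than by assigning into a long[] array:

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

/*
 * Sketch only: writing a struct with an int column and a string column.
 * The output path is just an example.
 */
public class WriteORCFileWithString {

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration(false);
    TypeDescription schema = TypeDescription.fromString("struct<id:int,name:string>");
    Writer writer = OrcFile.createWriter(new Path("/tmp/myfile-strings.orc"),
        OrcFile.writerOptions(conf).setSchema(schema));

    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector id = (LongColumnVector) batch.cols[0];
    BytesColumnVector name = (BytesColumnVector) batch.cols[1];

    for (int r = 0; r < 10000; ++r) {
      int row = batch.size++;
      id.vector[row] = r;
      // setVal() copies the bytes into the vector's buffer.
      byte[] val = ("record-" + r).getBytes(StandardCharsets.UTF_8);
      name.setVal(row, val);
      if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    if (batch.size != 0) {
      writer.addRowBatch(batch);
    }
    writer.close();
  }
}

Since setVal() copies the bytes, the source array can be reused across rows; BytesColumnVector also has a setRef() method that stores a reference instead of copying, for cases where the byte arrays outlive the batch.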