To build ORC, download the current release (currently 1.3.2) from https://orc.apache.org, then unpack it and build the Java libraries with Maven:
tar xzvf orc-1.3.2.tar.gz && cd ./orc-1.3.2/
cd ./java
mvn package
ls -la ./tools/target/orc-tools-1.3.2-uber.jar
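The uber jar also doubles as a command-line tool. Once the writer example below has created /tmp/myfile.orc, you should be able to print that file's metadata with the meta subcommand as a quick sanity check of the build (the exact subcommand set varies a little by release):
java -jar ./tools/target/orc-tools-1.3.2-uber.jar meta /tmp/myfile.orc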
A simple example of writing is:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

/*
 * Basic code from https://orc.apache.org/docs/core-java.html#writing-orc-files
 * Using Core Java - Writing ORC Files
 *
 * orc-tools-X.Y.Z-uber.jar is required in the runtime classpath for io/airlift/compress/Decompressor
 *
 * Creates myfile.orc AND .myfile.orc.crc; fails if myfile.orc already exists.
 *
 * awcoleman@gmail.com
 */
public class WriteORCFileWORCCore {

  public WriteORCFileWORCCore() throws IllegalArgumentException, IOException {
    String outfilename = "/tmp/myfile.orc";
    Configuration conf = new Configuration(false);

    /*
     * Writer is in the orc-core jar and has dependencies on the
     * Hadoop HDFS client libs.
     */
    TypeDescription schema = TypeDescription.fromString("struct<x:int,y:int>");
    Writer writer = OrcFile.createWriter(new Path(outfilename),
        OrcFile.writerOptions(conf)
            .setSchema(schema));

    /*
     * VectorizedRowBatch and LongColumnVector are in the hive-storage-api jar.
     */
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector x = (LongColumnVector) batch.cols[0];
    LongColumnVector y = (LongColumnVector) batch.cols[1];
    for (int r = 0; r < 10000; ++r) {
      int row = batch.size++;
      x.vector[row] = r;
      y.vector[row] = r * 3;
      // If the batch is full, write it out and start over.
      if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    // Write the last partial batch out, if there is one.
    if (batch.size != 0) {
      writer.addRowBatch(batch);
    }
    writer.close();

    // Output info to console
    System.out.println("Wrote " + writer.getNumberOfRows() + " records to ORC file "
        + (new Path(outfilename).toString()));
  }

  public static void main(String[] args) throws IllegalArgumentException, IOException {
    @SuppressWarnings("unused")
    WriteORCFileWORCCore mainobj = new WriteORCFileWORCCore();
  }
}
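To compile and run the writer, the uber jar alone should be enough on the classpath, since it bundles orc-core, hive-storage-api, and the Hadoop client classes. A minimal sketch, assuming the source file sits next to a copy of the jar:
javac -cp orc-tools-1.3.2-uber.jar WriteORCFileWORCCore.java
java -cp .:orc-tools-1.3.2-uber.jar WriteORCFileWORCCore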
And a simple example of reading is:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

/*
 * Basic code from https://orc.apache.org/docs/core-java.html#reading-orc-files
 * Using Core Java - Reading ORC Files
 *
 * Reads the ORC file written by WriteORCFileWORCCore.
 *
 * orc-tools-X.Y.Z-uber.jar is required in the runtime classpath for io/airlift/compress/Decompressor
 *
 * awcoleman@gmail.com
 */
public class ReadORCFileWORCCore {

  public ReadORCFileWORCCore() throws IllegalArgumentException, IOException {
    String infilename = "/tmp/myfile.orc";
    Configuration conf = new Configuration(false);

    Reader reader = OrcFile.createReader(new Path(infilename),
        OrcFile.readerOptions(conf));
    RecordReader rows = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();

    // Some basic info about the ORC file
    TypeDescription schema = reader.getSchema();
    long numRecsInFile = reader.getNumberOfRows();
    System.out.println("Reading ORC file " + (new Path(infilename).toString()));
    System.out.println("ORC file schema: " + schema.toJson());
    System.out.println("Number of records in ORC file: " + numRecsInFile);

    while (rows.nextBatch(batch)) {
      System.out.println("Processing batch of records from ORC file. Number of records in batch: " + batch.size);
      LongColumnVector field1 = (LongColumnVector) batch.cols[0];
      LongColumnVector field2 = (LongColumnVector) batch.cols[1];
      for (int r = 0; r < batch.size; ++r) {
        int field1rowr = (int) field1.vector[r];
        int field2rowr = (int) field2.vector[r];
        System.out.println("For row " + r + " in this batch, field1 is: " + field1rowr
            + " and field2 is: " + field2rowr);
      }
    }
    rows.close();
  }

  public static void main(String[] args) throws IllegalArgumentException, IOException {
    @SuppressWarnings("unused")
    ReadORCFileWORCCore mainObj = new ReadORCFileWORCCore();
  }
}
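The reader runs the same way, and its output can be cross-checked against the tools jar's data subcommand, which should dump the rows as JSON (again assuming the jar sits in the current directory):
java -cp .:orc-tools-1.3.2-uber.jar ReadORCFileWORCCore
java -jar orc-tools-1.3.2-uber.jar data /tmp/myfile.orc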