5 files changed, 416 insertions, 0 deletions
diff --git a/pbf2db/binarystream.py b/pbf2db/binarystream.py
new file mode 100644
index 0000000..e1e0280
--- /dev/null
+++ b/pbf2db/binarystream.py
@@ -0,0 +1,95 @@
+from struct import *
+
+class BinaryStream:  # struct-based typed reads/writes (native byte order) on a file-like object
+    def __init__(self, base_stream):  # base_stream supplies read() and/or write()
+        self.base_stream = base_stream
+
+    def readByte(self):  # raw result of base_stream.read(1)
+        return self.base_stream.read(1)
+
+    def readBytes(self, length):  # raw result of base_stream.read(length)
+        return self.base_stream.read(length)
+
+    def readChar(self):  # signed 8-bit integer
+        return self.unpack('b')
+
+    def readUChar(self):  # unsigned 8-bit integer
+        return self.unpack('B')
+
+    def readBool(self):
+        return self.unpack('?')
+
+    def readInt16(self):
+        return self.unpack('h', 2)
+
+    def readUInt16(self):
+        return self.unpack('H', 2)
+
+    def readInt32(self):
+        return self.unpack('i', 4)
+
+    def readUInt32(self):
+        return self.unpack('I', 4)
+
+    def readInt64(self):
+        return self.unpack('q', 8)
+
+    def readUInt64(self):
+        return self.unpack('Q', 8)
+
+    def readFloat(self):
+        return self.unpack('f', 4)
+
+    def readDouble(self):
+        return self.unpack('d', 8)
+
+    def readString(self):  # UInt16 length prefix, then that many bytes
+        length = self.readUInt16()
+        return self.unpack(str(length) + 's', length)
+
+    def writeBytes(self, value):
+        self.base_stream.write(value)
+
+    def writeChar(self, value):  # int -> signed byte (mirrors readChar); 1-byte string still accepted
+        self.pack('c' if isinstance(value, (bytes, str)) else 'b', value)
+
+    def writeUChar(self, value):  # 'B' mirrors readUChar; the former 'C' is not a struct format
+        self.pack('B', value)
+
+    def writeBool(self, value):
+        self.pack('?', value)
+
+    def writeInt16(self, value):
+        self.pack('h', value)
+
+    def writeUInt16(self, value):
+        self.pack('H', value)
+
+    def writeInt32(self, value):
+        self.pack('i', value)
+
+    def writeUInt32(self, value):
+        self.pack('I', value)
+
+    def writeInt64(self, value):
+        self.pack('q', value)
+
+    def writeUInt64(self, value):
+        self.pack('Q', value)
+
+    def writeFloat(self, value):
+        self.pack('f', value)
+
+    def writeDouble(self, value):
+        self.pack('d', value)
+
+    def writeString(self, value):  # UInt16 length prefix, then the bytes (mirrors readString)
+        length = len(value)
+        self.writeUInt16(length)
+        self.pack(str(length) + 's', value)
+
+    def pack(self, fmt, data):  # struct-pack one value and write it to base_stream
+        return self.writeBytes(pack(fmt, data))
+
+    def unpack(self, fmt, length = 1):  # read `length` bytes, return the first unpacked value
+        return unpack(fmt, self.readBytes(length))[0]
diff --git a/pbf2db/build_proto.sh b/pbf2db/build_proto.sh
new file mode 100755
index 0000000..53e9966
--- /dev/null
+++ b/pbf2db/build_proto.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+# One protoc run for both schemas, so a failure in either one sets the script's exit status.
+protoc --python_out=. fileformat.proto osmformat.proto
diff --git a/pbf2db/fileformat.proto b/pbf2db/fileformat.proto
new file mode 100644
index 0000000..f1b540a
--- /dev/null
+++ b/pbf2db/fileformat.proto
@@ -0,0 +1,49 @@
+/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as 
+   published by the Free Software Foundation, either version 3 of the 
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+package OSMPBF;
+
+//
+//  STORAGE LAYER: Storing primitives.
+//
+
+message Blob { // Data block of a fileblock: carries the payload either raw or compressed.
+  optional bytes raw = 1; // No compression
+  optional int32 raw_size = 2; // When compressed, the uncompressed size
+
+  // Possible compressed versions of the data.
+  optional bytes zlib_data = 3;
+
+  // PROPOSED feature for LZMA compressed data. SUPPORT IS NOT REQUIRED.
+  optional bytes lzma_data = 4;
+
+  // Formerly used for bzip2 compressed data. Deprecated in 2010.
+  optional bytes OBSOLETE_bzip2_data = 5 [deprecated=true]; // Don't reuse this tag number.
+}
+
+/* A file contains a sequence of fileblock headers, each prefixed by
+their length in network byte order, followed by a data block
+containing the actual data. Types starting with a "_" are reserved.
+*/
+
+message BlobHeader { // Fileblock header; written before each data block.
+  required string type = 1; // Block type name; names beginning with "_" are reserved.
+  optional bytes indexdata = 2;
+  required int32 datasize = 3; // Size of the data block that follows this header.
+}
+
+
diff --git a/pbf2db/osmformat.proto b/pbf2db/osmformat.proto
new file mode 100644
index 0000000..eaad195
--- /dev/null
+++ b/pbf2db/osmformat.proto
@@ -0,0 +1,225 @@
+/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as 
+   published by the Free Software Foundation, either version 3 of the 
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+package OSMPBF;
+
+/* OSM Binary file format 
+
+This is the master schema file of the OSM binary file format. This
+file is designed to support limited random-access and future
+extendability.
+
+A binary OSM file consists of a sequence of FileBlocks (please see
+fileformat.proto). The first fileblock contains a serialized instance
+of HeaderBlock, followed by a sequence of PrimitiveBlock blocks that
+contain the primitives.
+
+Each primitiveblock is designed to be independently parsable. It
+contains a string table storing all strings in that block (keys and
+values in tags, roles in relations, usernames, etc.) as well as
+metadata containing the precision of coordinates or timestamps in that
+block.
+
+A primitiveblock contains a sequence of primitive groups, each
+containing primitives of the same type (nodes, densenodes, ways,
+relations). Coordinates are stored in signed 64-bit integers. Lat&lon
+are measured in units of <granularity> nanodegrees. The default of
+granularity of 100 nanodegrees corresponds to about 1cm on the ground,
+and a full lat or lon fits into 32 bits.
+
+Converting an integer to a latitude or longitude uses the formula:
+$OUT = IN * granularity / 10**9$. Many encoding schemes use delta
+coding when representing nodes and relations.
+
+*/
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+
+/* Contains the file header. */
+
+message HeaderBlock { // Serialized into the first fileblock of a file.
+  optional HeaderBBox bbox = 1; // Always in nanodegrees; see HeaderBBox.
+  /* Additional tags to aid in parsing this dataset */
+  repeated string required_features = 4;
+  repeated string optional_features = 5;
+
+  optional string writingprogram = 16; // presumably names the program that wrote the file -- confirm
+  optional string source = 17; // From the bbox field.
+}
+
+
+/** The bounding box field in the OSM header. BBOX, as used in the OSM
+header. Units are always in nanodegrees -- they do not obey
+granularity rules. */
+
+message HeaderBBox { // Edges are always in nanodegrees, regardless of any block's granularity.
+   required sint64 left = 1; // NOTE(review): presumably a longitude -- confirm
+   required sint64 right = 2; // NOTE(review): presumably a longitude -- confirm
+   required sint64 top = 3; // NOTE(review): presumably a latitude -- confirm
+   required sint64 bottom = 4; // NOTE(review): presumably a latitude -- confirm
+}
+
+
+///////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+
+
+message PrimitiveBlock {
+  required StringTable stringtable = 1; // Holds all strings used in this block.
+  repeated PrimitiveGroup primitivegroup = 2;
+
+  // Granularity, units of nanodegrees, used to store coordinates in this block
+  optional int32 granularity = 17 [default=100]; 
+  // Offset value between the output coordinates and the granularity grid, in units of nanodegrees.
+  optional int64 lat_offset = 19 [default=0];
+  optional int64 lon_offset = 20 [default=0]; 
+
+  // Granularity of dates, normally represented in units of milliseconds since the 1970 epoch.
+  optional int32 date_granularity = 18 [default=1000]; 
+
+
+  // Proposed extension:
+  //optional BBox bbox = XX;
+}
+
+// Group of OSMPrimitives. All primitives in a group must be the same type.
+message PrimitiveGroup {
+  repeated Node     nodes = 1;
+  optional DenseNodes dense = 2; // Column-wise node encoding; see DenseNodes.
+  repeated Way      ways = 3;
+  repeated Relation relations = 4;
+  repeated ChangeSet changesets = 5; // ChangeSet is a stub design, not used right now.
+}
+
+
+/** String table, contains the common strings in each block.
+
+ Note that we reserve index '0' as a delimiter, so the entry at that
+ index in the table is ALWAYS blank and unused.
+
+ */
+message StringTable {
+   repeated bytes s = 1; // s[0] is the reserved blank entry; string IDs in this block index this list.
+}
+
+/* Optional metadata that may be included into each primitive. */
+message Info {
+   optional int32 version = 1 [default = -1]; // Reads as -1 when the field is absent.
+   optional int64 timestamp = 2; // NOTE(review): presumably scaled by PrimitiveBlock.date_granularity -- confirm
+   optional int64 changeset = 3;
+   optional int32 uid = 4;
+   optional uint32 user_sid = 5; // String ID of the user name (usernames live in the block's string table).
+}
+
+/** Optional metadata that may be included into each primitive. Special dense format used in DenseNodes. */
+message DenseInfo {
+   repeated int32 version = 1 [packed = true]; // Not marked DELTA coded, unlike the fields below.
+   repeated sint64 timestamp = 2 [packed = true]; // DELTA coded
+   repeated sint64 changeset = 3 [packed = true]; // DELTA coded
+   repeated sint32 uid = 4 [packed = true]; // DELTA coded
+   repeated sint32 user_sid = 5 [packed = true]; // String IDs for usernames. DELTA coded
+}
+
+
+// THIS IS STUB DESIGN FOR CHANGESETS. NOT USED RIGHT NOW.
+// TODO:    REMOVE THIS?
+message ChangeSet {
+   required int64 id = 1; // Only field defined so far; the rest of the design is commented out below.
+//   
+//   // Parallel arrays.
+//   repeated uint32 keys = 2 [packed = true]; // String IDs.
+//   repeated uint32 vals = 3 [packed = true]; // String IDs.
+//
+//   optional Info info = 4;
+
+//   optional int64 created_at = 8;
+//   optional int64 closetime_delta = 9;
+//   optional bool open = 10;
+//   optional HeaderBBox bbox = 11;
+}
+
+
+message Node {
+   required sint64 id = 1;
+   // Parallel arrays: keys[i] and vals[i] together form one tag.
+   repeated uint32 keys = 2 [packed = true]; // String IDs.
+   repeated uint32 vals = 3 [packed = true]; // String IDs.
+
+   optional Info info = 4; // May be omitted in omitmeta
+
+   required sint64 lat = 8; // In units of <granularity> nanodegrees (PrimitiveBlock.granularity).
+   required sint64 lon = 9; // Same units as lat.
+}
+
+/* Used to densely represent a sequence of nodes that do not have any tags.
+
+We represent these nodes columnwise as five columns: ID's, lats, and
+lons, all delta coded. When metadata is not omitted, it is stored in denseinfo.
+
+We encode keys & vals for all nodes as a single array of integers
+containing key-stringid and val-stringid, using a stringid of 0 as a
+delimiter between nodes.
+
+   ( (<keyid> <valid>)* '0' )*
+ */
+
+message DenseNodes {
+   repeated sint64 id = 1 [packed = true]; // DELTA coded
+
+   //repeated Info info = 4;
+   optional DenseInfo denseinfo = 5; // Metadata in the dense format; see DenseInfo.
+
+   repeated sint64 lat = 8 [packed = true]; // DELTA coded
+   repeated sint64 lon = 9 [packed = true]; // DELTA coded
+
+   // Special packing of keys and vals into one array. May be empty if all nodes in this block are tagless.
+   repeated int32 keys_vals = 10 [packed = true]; // Layout: ( (<keyid> <valid>)* '0' )*
+}
+
+
+message Way {
+   required int64 id = 1;
+   // Parallel arrays: keys[i] and vals[i] together form one tag.
+   repeated uint32 keys = 2 [packed = true]; // String IDs, as in Node.
+   repeated uint32 vals = 3 [packed = true]; // String IDs, as in Node.
+
+   optional Info info = 4;
+
+   repeated sint64 refs = 8 [packed = true];  // DELTA coded; NOTE(review): presumably node IDs -- confirm
+}
+
+message Relation {
+  enum MemberType {
+    NODE = 0;
+    WAY = 1;
+    RELATION = 2;
+  } 
+   required int64 id = 1;
+
+   // Parallel arrays: keys[i] and vals[i] together form one tag.
+   repeated uint32 keys = 2 [packed = true];
+   repeated uint32 vals = 3 [packed = true];
+
+   optional Info info = 4;
+
+   // Parallel arrays: entry i of each array describes member i.
+   repeated int32 roles_sid = 8 [packed = true]; // String IDs of the member roles
+   repeated sint64 memids = 9 [packed = true]; // DELTA encoded
+   repeated MemberType types = 10 [packed = true]; // MemberType of each member
+}
+
diff --git a/pbf2db/pbf2db.py b/pbf2db/pbf2db.py
new file mode 100755
index 0000000..c6089fc
--- /dev/null
+++ b/pbf2db/pbf2db.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+
+import osmformat_pb2
+import fileformat_pb2
+import sys
+import socket
+import zlib
+from binarystream import BinaryStream
+
+
+headerSizeMax = 64 * 1024  # largest BlobHeader accepted, in bytes
+bodySizeMax = 32*1024*1024  # largest Blob accepted, in bytes
+
+# Binary mode: text mode may translate newline bytes and corrupt the data.
+with open("berlin.osm.pbf", "rb") as f:
+    stream = BinaryStream(f)
+    # The header length prefix is stored in network byte order.
+    headerSize = socket.ntohl(stream.readUInt32())
+    if headerSizeMax < headerSize:
+        raise ValueError("Header to long")
+
+    headerbuff = stream.readBytes(headerSize)
+    blobheader = fileformat_pb2.BlobHeader()
+    blobheader.ParseFromString(headerbuff)
+    bodysize = blobheader.datasize
+    if bodySizeMax < bodysize:
+        raise ValueError("Body to fat")
+
+    blobbuff = stream.readBytes(bodysize)
+blob = fileformat_pb2.Blob()
+blob.ParseFromString(blobbuff)
+
+if blob.raw != "":  # an uncompressed payload is present
+    rawstr = blob.raw
+else:
+    rawstr = zlib.decompress(blob.zlib_data)
+
+headerblock = osmformat_pb2.HeaderBlock()
+headerblock.ParseFromString(rawstr)
+
+print "Source:",headerblock.source
+print "Writingprog:",headerblock.writingprogram
+print "required features:",headerblock.required_features
+