aboutsummaryrefslogtreecommitdiffstats
path: root/pbf2db
diff options
context:
space:
mode:
Diffstat (limited to 'pbf2db')
-rw-r--r--pbf2db/binarystream.py95
-rwxr-xr-xpbf2db/build_proto.sh3
-rw-r--r--pbf2db/fileformat.proto49
-rw-r--r--pbf2db/osmformat.proto225
-rwxr-xr-xpbf2db/pbf2db.py44
5 files changed, 416 insertions, 0 deletions
diff --git a/pbf2db/binarystream.py b/pbf2db/binarystream.py
new file mode 100644
index 0000000..e1e0280
--- /dev/null
+++ b/pbf2db/binarystream.py
@@ -0,0 +1,95 @@
+from struct import *
+
class BinaryStream:
    """Typed binary reader/writer layered over a file-like object.

    Fixed-width values are converted with the ``struct`` module using its
    default mode, i.e. the byte order of the host machine.  ``base_stream``
    must provide ``read(n)`` for the read methods and ``write(data)`` for
    the write methods.
    """

    def __init__(self, base_stream):
        """Wrap *base_stream*; the stream is used as-is and never closed."""
        self.base_stream = base_stream

    # ------------------------------------------------------------ reading

    def readByte(self):
        """Return the next byte as a raw length-1 byte string (not an int)."""
        return self.base_stream.read(1)

    def readBytes(self, length):
        """Return up to *length* raw bytes from the underlying stream."""
        return self.base_stream.read(length)

    def readChar(self):
        """Read a signed 8-bit integer."""
        return self.unpack('b')

    def readUChar(self):
        """Read an unsigned 8-bit integer."""
        return self.unpack('B')

    def readBool(self):
        """Read one byte and return it as a bool."""
        return self.unpack('?')

    def readInt16(self):
        """Read a signed 16-bit integer."""
        return self.unpack('h', 2)

    def readUInt16(self):
        """Read an unsigned 16-bit integer."""
        return self.unpack('H', 2)

    def readInt32(self):
        """Read a signed 32-bit integer."""
        return self.unpack('i', 4)

    def readUInt32(self):
        """Read an unsigned 32-bit integer."""
        return self.unpack('I', 4)

    def readInt64(self):
        """Read a signed 64-bit integer."""
        return self.unpack('q', 8)

    def readUInt64(self):
        """Read an unsigned 64-bit integer."""
        return self.unpack('Q', 8)

    def readFloat(self):
        """Read a 32-bit float."""
        return self.unpack('f', 4)

    def readDouble(self):
        """Read a 64-bit float."""
        return self.unpack('d', 8)

    def readString(self):
        """Read a string written by writeString and return its raw bytes.

        The layout is an unsigned 16-bit byte count followed by that many
        bytes; no decoding is applied.
        """
        length = self.readUInt16()
        return self.unpack(str(length) + 's', length)

    # ------------------------------------------------------------ writing

    def writeBytes(self, value):
        """Write *value* to the underlying stream unchanged."""
        self.base_stream.write(value)

    def writeChar(self, value):
        """Write one signed 8-bit value.

        An integer is written with the same format readChar decodes.  A
        length-1 byte string is still accepted, as before, and is written
        verbatim.
        """
        fmt = 'c' if isinstance(value, bytes) else 'b'
        self.pack(fmt, value)

    def writeUChar(self, value):
        """Write an unsigned 8-bit integer."""
        # 'B' is the struct code for unsigned char; 'C' is not a valid code.
        self.pack('B', value)

    def writeBool(self, value):
        """Write *value* as a one-byte bool."""
        self.pack('?', value)

    def writeInt16(self, value):
        """Write a signed 16-bit integer."""
        self.pack('h', value)

    def writeUInt16(self, value):
        """Write an unsigned 16-bit integer."""
        self.pack('H', value)

    def writeInt32(self, value):
        """Write a signed 32-bit integer."""
        self.pack('i', value)

    def writeUInt32(self, value):
        """Write an unsigned 32-bit integer."""
        self.pack('I', value)

    def writeInt64(self, value):
        """Write a signed 64-bit integer."""
        self.pack('q', value)

    def writeUInt64(self, value):
        """Write an unsigned 64-bit integer."""
        self.pack('Q', value)

    def writeFloat(self, value):
        """Write a 32-bit float."""
        self.pack('f', value)

    def writeDouble(self, value):
        """Write a 64-bit float."""
        self.pack('d', value)

    def writeString(self, value):
        """Write *value* as an unsigned 16-bit byte count plus the bytes.

        Text (unicode) input is encoded as UTF-8 first so that the length
        prefix counts bytes, which is what readString consumes.  Raises
        ``struct.error`` if the encoded value is longer than 65535 bytes.
        """
        if isinstance(value, type(u'')):
            value = value.encode('utf-8')
        length = len(value)
        self.writeUInt16(length)
        self.pack(str(length) + 's', value)

    # ------------------------------------------------------------ helpers

    def pack(self, fmt, data):
        """Encode *data* with struct format *fmt* and write the result."""
        # The bare name resolves to struct.pack (module-level import), not
        # to this method.
        return self.writeBytes(pack(fmt, data))

    def unpack(self, fmt, length=1):
        """Read *length* bytes and decode them with struct format *fmt*.

        Returns the first decoded value.  Raises ``struct.error`` when the
        bytes read do not match the size *fmt* requires (for example at end
        of stream).
        """
        return unpack(fmt, self.readBytes(length))[0]
diff --git a/pbf2db/build_proto.sh b/pbf2db/build_proto.sh
new file mode 100755
index 0000000..53e9966
--- /dev/null
+++ b/pbf2db/build_proto.sh
@@ -0,0 +1,3 @@
#!/bin/sh
# Generate the Python protobuf modules for both schema files into the
# current directory. Run this from the directory that holds the .proto files.

# Abort on the first failing protoc call; otherwise the script's exit status
# would only reflect the last command and an earlier failure would go unnoticed.
set -e

protoc --python_out=. fileformat.proto
protoc --python_out=. osmformat.proto
diff --git a/pbf2db/fileformat.proto b/pbf2db/fileformat.proto
new file mode 100644
index 0000000..f1b540a
--- /dev/null
+++ b/pbf2db/fileformat.proto
@@ -0,0 +1,49 @@
+/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+package OSMPBF;
+
+//
+// STORAGE LAYER: Storing primitives.
+//
+
+message Blob { // Holds one block of data, either uncompressed (raw) or in a compressed form.
+ optional bytes raw = 1; // No compression
+ optional int32 raw_size = 2; // When compressed, the uncompressed size
+
+ // Possible compressed versions of the data.
+ optional bytes zlib_data = 3;
+
+ // PROPOSED feature for LZMA compressed data. SUPPORT IS NOT REQUIRED.
+ optional bytes lzma_data = 4;
+
+ // Formerly used for bzip2 compressed data. Deprecated in 2010.
+ optional bytes OBSOLETE_bzip2_data = 5 [deprecated=true]; // Don't reuse this tag number.
+}
+
+/* A file contains a sequence of fileblock headers, each prefixed by
+its length in network byte order, followed by a data block
+containing the actual data. Types starting with a "_" are reserved.
+*/
+
+message BlobHeader { // Header placed in front of each data block.
+ required string type = 1; // Block type name; names starting with "_" are reserved.
+ optional bytes indexdata = 2; // NOTE(review): purpose not documented in this file -- confirm.
+ required int32 datasize = 3; // Size of the data block that follows this header.
+}
+
+
diff --git a/pbf2db/osmformat.proto b/pbf2db/osmformat.proto
new file mode 100644
index 0000000..eaad195
--- /dev/null
+++ b/pbf2db/osmformat.proto
@@ -0,0 +1,225 @@
+/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+package OSMPBF;
+
+/* OSM Binary file format
+
+This is the master schema file of the OSM binary file format. This
+file is designed to support limited random-access and future
+extendability.
+
+A binary OSM file consists of a sequence of FileBlocks (please see
+fileformat.proto). The first fileblock contains a serialized instance
+of HeaderBlock, followed by a sequence of PrimitiveBlock blocks that
+contain the primitives.
+
+Each primitiveblock is designed to be independently parsable. It
+contains a string table storing all strings in that block (keys and
+values in tags, roles in relations, usernames, etc.) as well as
+metadata containing the precision of coordinates or timestamps in that
+block.
+
+A primitiveblock contains a sequence of primitive groups, each
+containing primitives of the same type (nodes, densenodes, ways,
+relations). Coordinates are stored in signed 64-bit integers. Lat&lon
+are measured in units of <granularity> nanodegrees. The default
+granularity of 100 nanodegrees corresponds to about 1cm on the ground,
+and a full lat or lon fits into 32 bits.
+
+Converting an integer to a latitude or longitude uses the formula:
+$OUT = IN * granularity / 10**9$. Many encoding schemes use delta
+coding when representing nodes and relations.
+
+*/
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+
+/* Contains the file header. */
+
+message HeaderBlock {
+ optional HeaderBBox bbox = 1; // Bounding box of the data; always in nanodegrees.
+ /* Additional tags to aid in parsing this dataset */
+ repeated string required_features = 4;
+ repeated string optional_features = 5;
+
+ optional string writingprogram = 16; // Presumably the name of the program that wrote the file -- confirm.
+ optional string source = 17; // From the bbox field.
+}
+
+
+/** The bounding box field in the OSM header.
+Units are always in nanodegrees -- they do not obey
+granularity rules. */
+
+message HeaderBBox {
+ required sint64 left = 1; // nanodegrees
+ required sint64 right = 2; // nanodegrees
+ required sint64 top = 3; // nanodegrees
+ required sint64 bottom = 4; // nanodegrees
+}
+
+
+///////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+
+
+message PrimitiveBlock { // Independently parsable block: a string table plus groups of primitives.
+ required StringTable stringtable = 1; // All strings used by the primitives in this block.
+ repeated PrimitiveGroup primitivegroup = 2;
+
+ // Granularity, units of nanodegrees, used to store coordinates in this block
+ optional int32 granularity = 17 [default=100];
+ // Offset value between the output coordinates and the granularity grid, in units of nanodegrees.
+ optional int64 lat_offset = 19 [default=0];
+ optional int64 lon_offset = 20 [default=0];
+
+// Granularity of dates, normally represented in units of milliseconds since the 1970 epoch.
+ optional int32 date_granularity = 18 [default=1000];
+
+
+ // Proposed extension:
+ //optional BBox bbox = XX;
+}
+
+// Group of OSMPrimitives. All primitives in a group must be the same type,
+// so only one of the fields below is expected to be populated per group.
+message PrimitiveGroup {
+ repeated Node nodes = 1;
+ optional DenseNodes dense = 2; // Column-wise encoding of a sequence of nodes.
+ repeated Way ways = 3;
+ repeated Relation relations = 4;
+ repeated ChangeSet changesets = 5; // ChangeSet is a stub design, not used right now.
+}
+
+
+/** String table, contains the common strings in each block.
+
+ Note that we reserve index '0' as a delimiter, so the entry at that
+ index in the table is ALWAYS blank and unused.
+
+ */
+message StringTable {
+ repeated bytes s = 1; // The strings; other messages refer to them by index ("String IDs").
+}
+
+/* Optional metadata that may be included into each primitive. */
+message Info {
+ optional int32 version = 1 [default = -1];
+ optional int64 timestamp = 2; // Presumably in units of the block's date_granularity -- confirm.
+ optional int64 changeset = 3;
+ optional int32 uid = 4;
+ optional uint32 user_sid = 5; // String ID of the user name
+}
+
+/** Optional metadata that may be included into each primitive. Special dense format used in DenseNodes.
+ NOTE(review): the arrays presumably hold one entry per node of the enclosing DenseNodes -- confirm. */
+message DenseInfo {
+ repeated int32 version = 1 [packed = true]; // Not marked as delta coded, unlike the fields below.
+ repeated sint64 timestamp = 2 [packed = true]; // DELTA coded
+ repeated sint64 changeset = 3 [packed = true]; // DELTA coded
+ repeated sint32 uid = 4 [packed = true]; // DELTA coded
+ repeated sint32 user_sid = 5 [packed = true]; // String IDs for usernames. DELTA coded
+}
+
+
+// THIS IS A STUB DESIGN FOR CHANGESETS. NOT USED RIGHT NOW.
+// TODO: REMOVE THIS?
+message ChangeSet {
+ required int64 id = 1; // The only active field; the remaining design is commented out below.
+//
+// // Parallel arrays.
+// repeated uint32 keys = 2 [packed = true]; // String IDs.
+// repeated uint32 vals = 3 [packed = true]; // String IDs.
+//
+// optional Info info = 4;
+
+// optional int64 created_at = 8;
+// optional int64 closetime_delta = 9;
+// optional bool open = 10;
+// optional HeaderBBox bbox = 11;
+}
+
+
+message Node { // A single node: id, tags, optional metadata and position.
+ required sint64 id = 1;
+ // Parallel arrays: keys[i] and vals[i] together form one tag.
+ repeated uint32 keys = 2 [packed = true]; // String IDs.
+ repeated uint32 vals = 3 [packed = true]; // String IDs.
+
+ optional Info info = 4; // May be omitted in omitmeta
+
+ required sint64 lat = 8; // In units of the enclosing block's granularity.
+ required sint64 lon = 9; // In units of the enclosing block's granularity.
+}
+
+/* Used to densely represent a sequence of nodes that do not have any tags.
+
+We represent these nodes columnwise as five columns: IDs, lats, and
+lons, all delta coded. When metadata is not omitted,
+(NOTE(review): the sentence above was left unfinished; it presumably refers to the denseinfo field -- confirm.)
+
+We encode keys & vals for all nodes as a single array of integers
+containing key-stringid and val-stringid, using a stringid of 0 as a
+delimiter between nodes.
+
+ ( (<keyid> <valid>)* '0' )*
+ */
+
+message DenseNodes {
+ repeated sint64 id = 1 [packed = true]; // DELTA coded
+
+ //repeated Info info = 4;
+ optional DenseInfo denseinfo = 5;
+
+ repeated sint64 lat = 8 [packed = true]; // DELTA coded
+ repeated sint64 lon = 9 [packed = true]; // DELTA coded
+
+ // Special packing of keys and vals into one array. May be empty if all nodes in this block are tagless.
+ repeated int32 keys_vals = 10 [packed = true];
+}
+
+
+message Way { // A way: id, tags, optional metadata and its node references.
+ required int64 id = 1;
+ // Parallel arrays: keys[i] and vals[i] together form one tag.
+ repeated uint32 keys = 2 [packed = true]; // Presumably String IDs, as in Node -- confirm.
+ repeated uint32 vals = 3 [packed = true]; // Presumably String IDs, as in Node -- confirm.
+
+ optional Info info = 4;
+
+ repeated sint64 refs = 8 [packed = true]; // DELTA coded
+}
+
+message Relation { // A relation: id, tags, optional metadata and its members.
+ enum MemberType { // Kind of primitive a member refers to.
+ NODE = 0;
+ WAY = 1;
+ RELATION = 2;
+ }
+ required int64 id = 1;
+
+ // Parallel arrays: keys[i] and vals[i] together form one tag.
+ repeated uint32 keys = 2 [packed = true];
+ repeated uint32 vals = 3 [packed = true];
+
+ optional Info info = 4;
+
+ // Parallel arrays: entry i of each array describes member i.
+ repeated int32 roles_sid = 8 [packed = true]; // Presumably String IDs of the member roles -- confirm.
+ repeated sint64 memids = 9 [packed = true]; // DELTA encoded
+ repeated MemberType types = 10 [packed = true];
+}
+
diff --git a/pbf2db/pbf2db.py b/pbf2db/pbf2db.py
new file mode 100755
index 0000000..c6089fc
--- /dev/null
+++ b/pbf2db/pbf2db.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+
+import osmformat_pb2
+import fileformat_pb2
+import sys
+import socket
+import zlib
+from binarystream import BinaryStream
+
+
# Size fields read from the file are rejected when they exceed these limits.
headerSizeMax = 64 * 1024
bodySizeMax = 32 * 1024 * 1024

# The file holds binary data, so it is opened in binary mode; the with-block
# closes it as soon as the first header/blob pair has been read.
with open("berlin.osm.pbf", "rb") as f:
    stream = BinaryStream(f)

    # The header length prefix is stored in network byte order, while
    # readUInt32 decodes in host order; ntohl converts between the two.
    headerSize = socket.ntohl(stream.readUInt32())

    if headerSizeMax < headerSize:
        raise ValueError("Header to long")

    headerbuff = stream.readBytes(headerSize)
    blobheader = fileformat_pb2.BlobHeader()
    blobheader.ParseFromString(headerbuff)
    bodysize = blobheader.datasize

    if bodySizeMax < bodysize:
        raise ValueError("Body to fat")

    blobbuff = stream.readBytes(bodysize)

blob = fileformat_pb2.Blob()
blob.ParseFromString(blobbuff)

# Use the uncompressed payload when present, otherwise inflate the zlib one.
# A truthiness test is used because comparing bytes with "" is always unequal
# on Python 3.
if blob.raw:
    rawstr = blob.raw
else:
    rawstr = zlib.decompress(blob.zlib_data)

headerblock = osmformat_pb2.HeaderBlock()
headerblock.ParseFromString(rawstr)

# Single-argument print calls produce the same output on Python 2 and 3.
print("Source: %s" % (headerblock.source,))
print("Writingprog: %s" % (headerblock.writingprogram,))
print("required features: %s" % (headerblock.required_features,))
+