diff options
Diffstat (limited to 'pbf2db')
-rw-r--r-- | pbf2db/binarystream.py | 95 | ||||
-rwxr-xr-x | pbf2db/build_proto.sh | 3 | ||||
-rw-r--r-- | pbf2db/fileformat.proto | 49 | ||||
-rw-r--r-- | pbf2db/osmformat.proto | 225 | ||||
-rwxr-xr-x | pbf2db/pbf2db.py | 44 |
5 files changed, 416 insertions, 0 deletions
import struct


class BinaryStream:
    """Read and write fixed-size binary values on a file-like object.

    Every value is converted with the ``struct`` module using format strings
    without a byte-order prefix, i.e. the platform's native byte order and
    sizes.  Callers that need a specific byte order must convert the value
    themselves.

    ``base_stream`` must provide ``read(n)`` for the ``read*`` methods and
    ``write(data)`` for the ``write*`` methods.
    """

    def __init__(self, base_stream):
        self.base_stream = base_stream

    # ------------------------------------------------------------------
    # Reading
    # ------------------------------------------------------------------

    def readByte(self):
        """Return the next byte as a raw (at most 1 byte long) byte string."""
        return self.base_stream.read(1)

    def readBytes(self, length):
        """Return up to ``length`` raw bytes read from the stream."""
        return self.base_stream.read(length)

    def readChar(self):
        """Read one byte and return it as a signed integer."""
        return self.unpack('b')

    def readUChar(self):
        """Read one byte and return it as an unsigned integer."""
        return self.unpack('B')

    def readBool(self):
        """Read one byte and return it as a bool."""
        return self.unpack('?')

    def readInt16(self):
        """Read a signed 16-bit integer."""
        return self.unpack('h', 2)

    def readUInt16(self):
        """Read an unsigned 16-bit integer."""
        return self.unpack('H', 2)

    def readInt32(self):
        """Read a signed 32-bit integer."""
        return self.unpack('i', 4)

    def readUInt32(self):
        """Read an unsigned 32-bit integer."""
        return self.unpack('I', 4)

    def readInt64(self):
        """Read a signed 64-bit integer."""
        return self.unpack('q', 8)

    def readUInt64(self):
        """Read an unsigned 64-bit integer."""
        return self.unpack('Q', 8)

    def readFloat(self):
        """Read a 32-bit float."""
        return self.unpack('f', 4)

    def readDouble(self):
        """Read a 64-bit float."""
        return self.unpack('d', 8)

    def readString(self):
        """Read a string stored as an unsigned 16-bit length plus payload.

        The payload is returned as a byte string; no decoding is done.
        """
        length = self.readUInt16()
        return self.unpack(str(length) + 's', length)

    # ------------------------------------------------------------------
    # Writing
    # ------------------------------------------------------------------

    def writeBytes(self, value):
        """Write the raw byte string ``value`` to the stream."""
        self.base_stream.write(value)

    def writeChar(self, value):
        """Write one byte given as a signed integer or a 1-byte string.

        Integers use the same ``'b'`` format ``readChar`` reads, so they
        round-trip.  A one-byte byte string is still accepted (``'c'``) for
        callers written against the previous behaviour.
        """
        fmt = 'c' if isinstance(value, bytes) else 'b'
        self.pack(fmt, value)

    def writeUChar(self, value):
        """Write one byte given as an unsigned integer (0..255)."""
        # 'B' is the unsigned-char format; the former 'C' is not a valid
        # struct format character and always raised struct.error.
        self.pack('B', value)

    def writeBool(self, value):
        """Write a bool as one byte."""
        self.pack('?', value)

    def writeInt16(self, value):
        """Write a signed 16-bit integer."""
        self.pack('h', value)

    def writeUInt16(self, value):
        """Write an unsigned 16-bit integer."""
        self.pack('H', value)

    def writeInt32(self, value):
        """Write a signed 32-bit integer."""
        self.pack('i', value)

    def writeUInt32(self, value):
        """Write an unsigned 32-bit integer."""
        self.pack('I', value)

    def writeInt64(self, value):
        """Write a signed 64-bit integer."""
        self.pack('q', value)

    def writeUInt64(self, value):
        """Write an unsigned 64-bit integer."""
        self.pack('Q', value)

    def writeFloat(self, value):
        """Write a 32-bit float."""
        self.pack('f', value)

    def writeDouble(self, value):
        """Write a 64-bit float."""
        self.pack('d', value)

    def writeString(self, value):
        """Write ``value`` as an unsigned 16-bit length followed by its bytes.

        Text (non-bytes) strings are encoded as UTF-8 first so that the
        length prefix counts bytes, which is what ``readString`` consumes.
        Raises ``struct.error`` if the payload is longer than 65535 bytes;
        nothing is written in that case.
        """
        if not isinstance(value, bytes):
            value = value.encode('utf-8')
        length = len(value)
        self.writeUInt16(length)
        self.pack(str(length) + 's', value)

    # ------------------------------------------------------------------
    # struct helpers
    # ------------------------------------------------------------------

    def pack(self, fmt, data):
        """Pack the single value ``data`` with ``fmt`` and write it."""
        return self.writeBytes(struct.pack(fmt, data))

    def unpack(self, fmt, length=1):
        """Read ``length`` bytes and return the single value ``fmt`` decodes.

        Raises ``struct.error`` if the stream returns fewer bytes than
        ``fmt`` requires (for example at end of file).
        """
        return struct.unpack(fmt, self.readBytes(length))[0]
+*/ + +message BlobHeader { + required string type = 1; + optional bytes indexdata = 2; + required int32 datasize = 3; +} + + diff --git a/pbf2db/osmformat.proto b/pbf2db/osmformat.proto new file mode 100644 index 0000000..eaad195 --- /dev/null +++ b/pbf2db/osmformat.proto @@ -0,0 +1,225 @@ +/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +*/ + +package OSMPBF; + +/* OSM Binary file format + +This is the master schema file of the OSM binary file format. This +file is designed to support limited random-access and future +extendability. + +A binary OSM file consists of a sequence of FileBlocks (please see +fileformat.proto). The first fileblock contains a serialized instance +of HeaderBlock, followed by a sequence of PrimitiveBlock blocks that +contain the primitives. + +Each primitiveblock is designed to be independently parsable. It +contains a string table storing all strings in that block (keys and +values in tags, roles in relations, usernames, etc.) as well as +metadata containing the precision of coordinates or timestamps in that +block. + +A primitiveblock contains a sequence of primitive groups, each +containing primitives of the same type (nodes, densenodes, ways, +relations). Coordinates are stored in signed 64-bit integers. Lat&lon +are measured in units <granularity> nanodegrees. 
The default +granularity of 100 nanodegrees corresponds to about 1cm on the ground, +and a full lat or lon fits into 32 bits. + +Converting an integer to a latitude or longitude uses the formula: +$OUT = IN * granularity / 10**9$. Many encoding schemes use delta +coding when representing nodes and relations. + +*/ + +////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////// + +/* Contains the file header. */ + +message HeaderBlock { + optional HeaderBBox bbox = 1; + /* Additional tags to aid in parsing this dataset */ + repeated string required_features = 4; + repeated string optional_features = 5; + + optional string writingprogram = 16; + optional string source = 17; // From the bbox field. +} + + +/** The bounding box field in the OSM header. BBOX, as used in the OSM +header. Units are always in nanodegrees -- they do not obey +granularity rules. */ + +message HeaderBBox { + required sint64 left = 1; + required sint64 right = 2; + required sint64 top = 3; + required sint64 bottom = 4; +} + + +/////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + + +message PrimitiveBlock { + required StringTable stringtable = 1; + repeated PrimitiveGroup primitivegroup = 2; + + // Granularity, units of nanodegrees, used to store coordinates in this block + optional int32 granularity = 17 [default=100]; + // Offset value between the output coordinates and the granularity grid in units of nanodegrees. + optional int64 lat_offset = 19 [default=0]; + optional int64 lon_offset = 20 [default=0]; + +// Granularity of dates, normally represented in units of milliseconds since the 1970 epoch. + optional int32 date_granularity = 18 [default=1000]; + + + // Proposed extension: + //optional BBox bbox = XX; +} + +// Group of OSMPrimitives. All primitives in a group must be the same type. 
+message PrimitiveGroup { + repeated Node nodes = 1; + optional DenseNodes dense = 2; + repeated Way ways = 3; + repeated Relation relations = 4; + repeated ChangeSet changesets = 5; +} + + +/** String table, contains the common strings in each block. + + Note that we reserve index '0' as a delimiter, so the entry at that + index in the table is ALWAYS blank and unused. + + */ +message StringTable { + repeated bytes s = 1; +} + +/* Optional metadata that may be included into each primitive. */ +message Info { + optional int32 version = 1 [default = -1]; + optional int64 timestamp = 2; + optional int64 changeset = 3; + optional int32 uid = 4; + optional uint32 user_sid = 5; // String IDs +} + +/** Optional metadata that may be included into each primitive. Special dense format used in DenseNodes. */ +message DenseInfo { + repeated int32 version = 1 [packed = true]; + repeated sint64 timestamp = 2 [packed = true]; // DELTA coded + repeated sint64 changeset = 3 [packed = true]; // DELTA coded + repeated sint32 uid = 4 [packed = true]; // DELTA coded + repeated sint32 user_sid = 5 [packed = true]; // String IDs for usernames. DELTA coded +} + + +// THIS IS A STUB DESIGN FOR CHANGESETS. NOT USED RIGHT NOW. +// TODO: REMOVE THIS? +message ChangeSet { + required int64 id = 1; +// +// // Parallel arrays. +// repeated uint32 keys = 2 [packed = true]; // String IDs. +// repeated uint32 vals = 3 [packed = true]; // String IDs. +// +// optional Info info = 4; + +// optional int64 created_at = 8; +// optional int64 closetime_delta = 9; +// optional bool open = 10; +// optional HeaderBBox bbox = 11; +} + + +message Node { + required sint64 id = 1; + // Parallel arrays. + repeated uint32 keys = 2 [packed = true]; // String IDs. + repeated uint32 vals = 3 [packed = true]; // String IDs. + + optional Info info = 4; // May be omitted in omitmeta + + required sint64 lat = 8; + required sint64 lon = 9; +} + +/* Used to densely represent a sequence of nodes that do not have any tags. 
+ +We represent these nodes columnwise as five columns: ID's, lats, and +lons, all delta coded. When metadata is not omitted, + +We encode keys & vals for all nodes as a single array of integers +containing key-stringid and val-stringid, using a stringid of 0 as a +delimiter between nodes. + + ( (<keyid> <valid>)* '0' )* + */ + +message DenseNodes { + repeated sint64 id = 1 [packed = true]; // DELTA coded + + //repeated Info info = 4; + optional DenseInfo denseinfo = 5; + + repeated sint64 lat = 8 [packed = true]; // DELTA coded + repeated sint64 lon = 9 [packed = true]; // DELTA coded + + // Special packing of keys and vals into one array. May be empty if all nodes in this block are tagless. + repeated int32 keys_vals = 10 [packed = true]; +} + + +message Way { + required int64 id = 1; + // Parallel arrays. + repeated uint32 keys = 2 [packed = true]; + repeated uint32 vals = 3 [packed = true]; + + optional Info info = 4; + + repeated sint64 refs = 8 [packed = true]; // DELTA coded +} + +message Relation { + enum MemberType { + NODE = 0; + WAY = 1; + RELATION = 2; + } + required int64 id = 1; + + // Parallel arrays. 
+ repeated uint32 keys = 2 [packed = true]; + repeated uint32 vals = 3 [packed = true]; + + optional Info info = 4; + + // Parallel arrays + repeated int32 roles_sid = 8 [packed = true]; + repeated sint64 memids = 9 [packed = true]; // DELTA encoded + repeated MemberType types = 10 [packed = true]; +} + diff --git a/pbf2db/pbf2db.py b/pbf2db/pbf2db.py new file mode 100755 index 0000000..c6089fc --- /dev/null +++ b/pbf2db/pbf2db.py @@ -0,0 +1,44 @@ +#!/usr/bin/python + +import osmformat_pb2 +import fileformat_pb2 +import sys +import socket +import zlib +from binarystream import BinaryStream + + +headerSizeMax = 64 * 1024 +bodySizeMax = 32*1024*1024 + +f = open("berlin.osm.pbf") +stream = BinaryStream(f) +headerSize = socket.ntohl(stream.readUInt32()); + +if headerSizeMax < headerSize: + raise ValueError("Header to long") + +headerbuff = stream.readBytes(headerSize) +blobheader = fileformat_pb2.BlobHeader() +blobheader.ParseFromString(headerbuff) +bodysize = blobheader.datasize + +if bodySizeMax < bodysize: + raise ValueError("Body to fat") + +blobbuff = stream.readBytes(bodysize) +blob = fileformat_pb2.Blob() +blob.ParseFromString(blobbuff) + +if blob.raw != "": + rawstr = blob.raw +else: + rawstr = zlib.decompress(blob.zlib_data) + +headerblock = osmformat_pb2.HeaderBlock() +headerblock.ParseFromString(rawstr) + +print "Source:",headerblock.source +print "Writingprog:",headerblock.writingprogram +print "required features:",headerblock.required_features + |