Mercurial > hg > Members > shoshi > TreeCMSv1
changeset 53:7a558c7d4f41
added storage-conf.xml
author | shoshi |
---|---|
date | Wed, 16 Feb 2011 18:43:58 +0900 |
parents | 1b78f1f3add3 |
children | d830fb5aeece |
files | cassandra/storage-conf.xml src/treecms/demo/ContentsTreeBuilder.java |
diffstat | 2 files changed, 421 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cassandra/storage-conf.xml Wed Feb 16 18:43:58 2011 +0900 @@ -0,0 +1,421 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one + ~ or more contributor license agreements. See the NOTICE file + ~ distributed with this work for additional information + ~ regarding copyright ownership. The ASF licenses this file + ~ to you under the Apache License, Version 2.0 (the + ~ "License"); you may not use this file except in compliance + ~ with the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, + ~ software distributed under the License is distributed on an + ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + ~ KIND, either express or implied. See the License for the + ~ specific language governing permissions and limitations + ~ under the License. +--> +<Storage> + <!--======================================================================--> + <!-- Basic Configuration --> + <!--======================================================================--> + + <!-- + ~ The name of this cluster. This is mainly used to prevent machines in + ~ one logical cluster from joining another. + --> + <ClusterName>TreeCMS Cluster</ClusterName> + + <!-- + ~ Turn on to make new [non-seed] nodes automatically migrate the right data + ~ to themselves. (If no InitialToken is specified, they will pick one + ~ such that they will get half the range of the most-loaded node.) + ~ If a node starts up without bootstrapping, it will mark itself bootstrapped + ~ so that you can't subsequently accidently bootstrap a node with + ~ data on it. (You can reset this by wiping your data and commitlog + ~ directories.) + ~ + ~ Off by default so that new clusters and upgraders from 0.4 don't + ~ bootstrap immediately. You should turn this on when you start adding + ~ new nodes to a cluster that already has data on it. (If you are upgrading + ~ from 0.4, start your cluster with it off once before changing it to true. + ~ Otherwise, no data will be lost but you will incur a lot of unnecessary + ~ I/O before your cluster starts up.) + --> + <AutoBootstrap>false</AutoBootstrap> + + <!-- + ~ See http://wiki.apache.org/cassandra/HintedHandoff + --> + <HintedHandoffEnabled>true</HintedHandoffEnabled> + + <!-- + ~ The Index Interval determines the rate of sampling of row keys + ~ is for a given SSTable: 1/IndexInterval keys are held in memory + ~ for the lifetime of the sstable for use during key lookup. + ~ (This is separate from the KeyCache.) Larger intervals will result + ~ in lower memory usage at the cost of slower row lookup at read time. + --> + <IndexInterval>128</IndexInterval> + + <!-- + ~ Keyspaces and ColumnFamilies: + ~ A ColumnFamily is the Cassandra concept closest to a relational + ~ table. Keyspaces are separate groups of ColumnFamilies. Except in + ~ very unusual circumstances you will have one Keyspace per application. + + ~ There is an implicit keyspace named 'system' for Cassandra internals. + --> + <Keyspaces> + <Keyspace Name="TreeCMSKS"> + <ColumnFamily Name="TreeCMSCF01" CompareWith="BytesType"/> + <ColumnFamily Name="TreeCMSCF02" CompareWith="UTF8Type"/> + <ColumnFamily Name="TreeCMSCF03" CompareWith="UTF8Type"/> + <ReplicaPlacementStrategy>org.apache.cassandra.locator.RackUnawareStrategy</ReplicaPlacementStrategy> + <ReplicationFactor>1</ReplicationFactor> + <EndPointSnitch>org.apache.cassandra.locator.EndPointSnitch</EndPointSnitch> + </Keyspace> + <Keyspace Name="Keyspace1"> + <!-- + ~ ColumnFamily definitions have one required attribute (Name) + ~ and several optional ones. + ~ + ~ The CompareWith attribute tells Cassandra how to sort the columns + ~ for slicing operations. The default is BytesType, which is a + ~ straightforward lexical comparison of the bytes in each column. + ~ Other options are AsciiType, UTF8Type, LexicalUUIDType, TimeUUIDType, + ~ and LongType. You can also specify the fully-qualified class + ~ name to a class of your choice extending + ~ org.apache.cassandra.db.marshal.AbstractType. + ~ + ~ SuperColumns have a similar CompareSubcolumnsWith attribute. + ~ + ~ BytesType: Simple sort by byte value. No validation is performed. + ~ AsciiType: Like BytesType, but validates that the input can be + ~ parsed as US-ASCII. + ~ UTF8Type: A string encoded as UTF8 + ~ LongType: A 64bit long + ~ LexicalUUIDType: A 128bit UUID, compared lexically (by byte value) + ~ TimeUUIDType: a 128bit version 1 UUID, compared by timestamp + ~ + ~ (To get the closest approximation to 0.3-style supercolumns, you + ~ would use CompareWith=UTF8Type CompareSubcolumnsWith=LongType.) + ~ + ~ An optional `Comment` attribute may be used to attach additional + ~ human-readable information about the column family to its definition. + ~ + ~ The optional KeysCached attribute specifies + ~ the number of keys per sstable whose locations we keep in + ~ memory in "mostly LRU" order. (JUST the key locations, NOT any + ~ column values.) Specify a fraction (value less than 1), a percentage + ~ (ending in a % sign) or an absolute number of keys to cache. + ~ KeysCached defaults to 200000 keys. + ~ + ~ The optional RowsCached attribute specifies the number of rows + ~ whose entire contents we cache in memory. Do not use this on + ~ ColumnFamilies with large rows, or ColumnFamilies with high write:read + ~ ratios. Specify a fraction (value less than 1), a percentage (ending in + ~ a % sign) or an absolute number of rows to cache. + ~ RowsCached defaults to 0, i.e., row cache is off by default. + ~ + ~ Row and key caches may also be saved periodically; if so, the last- + ~ saved cache will be loaded in at server start. By default, cache + ~ saving is off. + ~ + ~ Remember, when using caches as a percentage, they WILL grow with + ~ your data set! + --> + <ColumnFamily Name="Standard1" CompareWith="BytesType" + KeysCached="1000" + RowsCached="100" + RowCacheSavePeriodInSeconds="0" + KeyCacheSavePeriodInSeconds="3600"/> + <ColumnFamily Name="Standard2" + CompareWith="UTF8Type" + KeysCached="100%"/> + <ColumnFamily Name="StandardByUUID1" CompareWith="TimeUUIDType" /> + <ColumnFamily Name="Super1" + ColumnType="Super" + CompareWith="BytesType" + CompareSubcolumnsWith="BytesType" /> + <ColumnFamily Name="Super2" + ColumnType="Super" + CompareWith="UTF8Type" + CompareSubcolumnsWith="UTF8Type" + RowsCached="10000" + KeysCached="50%" + Comment="A column family with supercolumns, whose column and subcolumn names are UTF8 strings"/> + + <!-- + ~ Strategy: The class that extends AbstractReplicationStrategy + ~ determines how replicas are placed around the token ring. + ~ Out of the box, Cassandra provides + ~ org.apache.cassandra.locator.RackUnawareStrategy and + ~ org.apache.cassandra.locator.RackAwareStrategy (place one replica in + ~ a different datacenter, and the others on different racks in the same + ~ one.) + --> + <ReplicaPlacementStrategy>org.apache.cassandra.locator.RackUnawareStrategy</ReplicaPlacementStrategy> + + <!-- Number of replicas of the data --> + <ReplicationFactor>1</ReplicationFactor> + + <!-- + ~ EndPointSnitch: Setting this to the class that implements + ~ AbstractEndpointSnitch, which lets Cassandra know enough + ~ about your network topology to route requests efficiently. + ~ Out of the box, Cassandra provides org.apache.cassandra.locator.EndPointSnitch, + ~ and PropertyFileEndPointSnitch is available in contrib/. + --> + <EndPointSnitch>org.apache.cassandra.locator.EndPointSnitch</EndPointSnitch> + + </Keyspace> + </Keyspaces> + + <!-- + ~ Authenticator: any IAuthenticator may be used, including your own as long + ~ as it is on the classpath. Out of the box, Cassandra provides + ~ org.apache.cassandra.auth.AllowAllAuthenticator and, + ~ org.apache.cassandra.auth.SimpleAuthenticator + ~ (SimpleAuthenticator uses access.properties and passwd.properties by + ~ default). + ~ + ~ If you don't specify an authenticator, AllowAllAuthenticator is used. + --> + <Authenticator>org.apache.cassandra.auth.AllowAllAuthenticator</Authenticator> + + <!-- + ~ Partitioner: any IPartitioner may be used, including your own as long + ~ as it is on the classpath. Out of the box, Cassandra provides + ~ org.apache.cassandra.dht.RandomPartitioner, + ~ org.apache.cassandra.dht.OrderPreservingPartitioner, and + ~ org.apache.cassandra.dht.CollatingOrderPreservingPartitioner. + ~ (CollatingOPP colates according to EN,US rules, not naive byte + ~ ordering. Use this as an example if you need locale-aware collation.) + ~ Range queries require using an order-preserving partitioner. + ~ + ~ Achtung! Changing this parameter requires wiping your data + ~ directories, since the partitioner can modify the sstable on-disk + ~ format. + --> + <Partitioner>org.apache.cassandra.dht.OrderPreservingPartitioner</Partitioner> + + <!-- + ~ You should always specify InitialToken when setting up a production + ~ cluster for the first time, and often when adding capacity later. + ~ The principle is that each node should be given an equal slice of + ~ the token ring; see http://wiki.apache.org/cassandra/Operations + ~ for more details. + ~ + ~ If blank, Cassandra will request a token bisecting the range of + ~ the heaviest-loaded existing node. If there is no load information + ~ available, such as is the case with a new cluster, it will pick + ~ a random token, which will lead to hot spots. + --> + <InitialToken></InitialToken> + + <!-- + ~ Directories: Specify where Cassandra should store different data on + ~ disk. Keep the data disks and the CommitLog disks separate for best + ~ performance + --> + <SavedCachesDirectory>var/lib/cassandra/saved_caches</SavedCachesDirectory> + <CommitLogDirectory>var/lib/cassandra/commitlog</CommitLogDirectory> + <DataFileDirectories> + <DataFileDirectory>var/lib/cassandra/data</DataFileDirectory> + </DataFileDirectories> + + + <!-- + ~ Addresses of hosts that are deemed contact points. Cassandra nodes + ~ use this list of hosts to find each other and learn the topology of + ~ the ring. You must change this if you are running multiple nodes! + --> + <Seeds> + <Seed>127.0.0.1</Seed> + </Seeds> + + + <!-- Miscellaneous --> + + <!-- Time to wait for a reply from other nodes before failing the command --> + <RpcTimeoutInMillis>10000</RpcTimeoutInMillis> + <!-- phi value that must be reached before a host is marked as down. + most users should never adjust this --> + <!-- PhiConvictThreshold>8</PhiConvictThreshold --> + <!-- Size to allow commitlog to grow to before creating a new segment --> + <CommitLogRotationThresholdInMB>128</CommitLogRotationThresholdInMB> + + + <!-- Local hosts and ports --> + + <!-- + ~ Address to bind to and tell other nodes to connect to. You _must_ + ~ change this if you want multiple nodes to be able to communicate! + ~ + ~ Leaving it blank leaves it up to InetAddress.getLocalHost(). This + ~ will always do the Right Thing *if* the node is properly configured + ~ (hostname, name resolution, etc), and the Right Thing is to use the + ~ address associated with the hostname (it might not be). + --> + <ListenAddress>localhost</ListenAddress> + <!-- internal communications port --> + <StoragePort>7000</StoragePort> + + <!-- + ~ The address to bind the Thrift RPC service to. Unlike ListenAddress + ~ above, you *can* specify 0.0.0.0 here if you want Thrift to listen on + ~ all interfaces. + ~ + ~ Leaving this blank has the same effect it does for ListenAddress, + ~ (i.e. it will be based on the configured hostname of the node). + --> + <ThriftAddress>::</ThriftAddress> + <!-- Thrift RPC port (the port clients connect to). --> + <ThriftPort>9160</ThriftPort> + <!-- + ~ Whether or not to use a framed transport for Thrift. If this option + ~ is set to true then you must also use a framed transport on the + ~ client-side, (framed and non-framed transports are not compatible). + --> + <ThriftFramedTransport>false</ThriftFramedTransport> + + + <!--======================================================================--> + <!-- Memory, Disk, and Performance --> + <!--======================================================================--> + + <!-- + ~ Access mode. mmapped i/o is substantially faster, but only practical on + ~ a 64bit machine (which notably does not include EC2 "small" instances) + ~ or relatively small datasets. "auto", the safe choice, will enable + ~ mmapping on a 64bit JVM. Other values are "mmap", "mmap_index_only" + ~ (which may allow you to get part of the benefits of mmap on a 32bit + ~ machine by mmapping only index files) and "standard". + ~ (The buffer size settings that follow only apply to standard, + ~ non-mmapped i/o.) + --> + <DiskAccessMode>auto</DiskAccessMode> + + <!-- + ~ Size of compacted row above which to log a warning. If compacted + ~ rows do not fit in memory, Cassandra will crash. (This is explained + ~ in http://wiki.apache.org/cassandra/CassandraLimitations and is + ~ scheduled to be fixed in 0.7.) Large rows can also be a problem + ~ when row caching is enabled. + --> + <RowWarningThresholdInMB>64</RowWarningThresholdInMB> + + <!-- + ~ Buffer size to use when performing contiguous column slices. Increase + ~ this to the size of the column slices you typically perform. + ~ (Name-based queries are performed with a buffer size of + ~ ColumnIndexSizeInKB.) + --> + <SlicedBufferSizeInKB>64</SlicedBufferSizeInKB> + + <!-- + ~ Buffer size to use when flushing memtables to disk. (Only one + ~ memtable is ever flushed at a time.) Increase (decrease) the index + ~ buffer size relative to the data buffer if you have few (many) + ~ columns per key. Bigger is only better _if_ your memtables get large + ~ enough to use the space. (Check in your data directory after your + ~ app has been running long enough.) --> + <FlushDataBufferSizeInMB>32</FlushDataBufferSizeInMB> + <FlushIndexBufferSizeInMB>8</FlushIndexBufferSizeInMB> + + <!-- + ~ Add column indexes to a row after its contents reach this size. + ~ Increase if your column values are large, or if you have a very large + ~ number of columns. The competing causes are, Cassandra has to + ~ deserialize this much of the row to read a single column, so you want + ~ it to be small - at least if you do many partial-row reads - but all + ~ the index data is read for each access, so you don't want to generate + ~ that wastefully either. + --> + <ColumnIndexSizeInKB>64</ColumnIndexSizeInKB> + + <!-- + ~ Flush memtable after this much data has been inserted, including + ~ overwritten data. There is one memtable per column family, and + ~ this threshold is based solely on the amount of data stored, not + ~ actual heap memory usage (there is some overhead in indexing the + ~ columns). + --> + <MemtableThroughputInMB>64</MemtableThroughputInMB> + <!-- + ~ Throughput setting for Binary Memtables. Typically these are + ~ used for bulk load so you want them to be larger. + --> + <BinaryMemtableThroughputInMB>256</BinaryMemtableThroughputInMB> + <!-- + ~ The maximum number of columns in millions to store in memory per + ~ ColumnFamily before flushing to disk. This is also a per-memtable + ~ setting. Use with MemtableThroughputInMB to tune memory usage. + --> + <MemtableOperationsInMillions>0.3</MemtableOperationsInMillions> + <!-- + ~ The maximum time to leave a dirty memtable unflushed. + ~ (While any affected columnfamilies have unflushed data from a + ~ commit log segment, that segment cannot be deleted.) + ~ This needs to be large enough that it won't cause a flush storm + ~ of all your memtables flushing at once because none has hit + ~ the size or count thresholds yet. + --> + <MemtableFlushAfterMinutes>60</MemtableFlushAfterMinutes> + + <!-- + ~ Unlike most systems, in Cassandra writes are faster than reads, so + ~ you can afford more of those in parallel. A good rule of thumb is 2 + ~ concurrent reads per processor core. Increase ConcurrentWrites to + ~ the number of clients writing at once if you enable CommitLogSync + + ~ CommitLogSyncDelay. --> + <ConcurrentReads>8</ConcurrentReads> + <ConcurrentWrites>32</ConcurrentWrites> + + <!-- + ~ CommitLogSync may be either "periodic" or "batch." When in batch + ~ mode, Cassandra won't ack writes until the commit log has been + ~ fsynced to disk. It will wait up to CommitLogSyncBatchWindowInMS + ~ milliseconds for other writes, before performing the sync. + + ~ This is less necessary in Cassandra than in traditional databases + ~ since replication reduces the odds of losing data from a failure + ~ after writing the log entry but before it actually reaches the disk. + ~ So the other option is "periodic," where writes may be acked immediately + ~ and the CommitLog is simply synced every CommitLogSyncPeriodInMS + ~ milliseconds. + --> + <CommitLogSync>periodic</CommitLogSync> + <!-- + ~ Interval at which to perform syncs of the CommitLog in periodic mode. + ~ Usually the default of 10000ms is fine; increase it if your i/o + ~ load is such that syncs are taking excessively long times. + --> + <CommitLogSyncPeriodInMS>10000</CommitLogSyncPeriodInMS> + <!-- + ~ Delay (in milliseconds) during which additional commit log entries + ~ may be written before fsync in batch mode. This will increase + ~ latency slightly, but can vastly improve throughput where there are + ~ many writers. Set to zero to disable (each entry will be synced + ~ individually). Reasonable values range from a minimal 0.1 to 10 or + ~ even more if throughput matters more than latency. + --> + <!-- <CommitLogSyncBatchWindowInMS>1</CommitLogSyncBatchWindowInMS> --> + + <!-- + ~ Time to wait before garbage-collection deletion markers. Set this to + ~ a large enough value that you are confident that the deletion marker + ~ will be propagated to all replicas by the time this many seconds has + ~ elapsed, even in the face of hardware failures. The default value is + ~ ten days. + --> + <GCGraceSeconds>864000</GCGraceSeconds> + + <!-- + ~ Enables or disables Read Repair. + ~ See http://wiki.apache.org/cassandra/ReadRepair + --> + <DoConsistencyChecksBoolean>true</DoConsistencyChecksBoolean> +</Storage>
--- a/src/treecms/demo/ContentsTreeBuilder.java Fri Feb 11 18:17:15 2011 +0900 +++ b/src/treecms/demo/ContentsTreeBuilder.java Wed Feb 16 18:43:58 2011 +0900 @@ -2,9 +2,6 @@ import java.io.File; import java.io.FileInputStream; - -import org.apache.commons.codec.binary.Base64; - import treecms.proto.api.Node; import treecms.proto.simple.SimpleBrowser;