创建session时可以传入session_options.
C++实现在:tensorflow/core/public/session_options.h中
Env:执行环境
target: 也就是要在哪里执行图,可以是本地,也可以连接到远程server上。
ConfigProto是最核心的配置,其他功能都在此配置里
// Options that can be passed in when a session is created.
struct SessionOptions {
  // Execution environment.
  Env* env;

  // Where the graph is executed. Takes one of three forms -- "local",
  // "ip:port", or "host:port" -- and may also name several targets.
  std::string target;

  /// Configuration options.
  ConfigProto config;

  SessionOptions();
};
ConfigProto
ConfigProto是用proto定义的:tensorflow/core/protobuf/config.proto
其中包含了很多配置
- GPU配置
- GRPC配置
- 图配置
- 设备配置:映射,放置
- 分布式集群配置
- 线程池配置
- sess.run配置
- debug配置
- 优化器配置
syntax = "proto3";
package tensorflow;
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/protobuf/cluster.proto";
import "tensorflow/core/protobuf/coordination_config.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/rewriter_config.proto";
option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";
// GPU-related configuration.
message GPUOptions {
  // Controls the fraction of GPU memory this process occupies.
  double per_process_gpu_memory_fraction = 1;

  // Take GPU memory gradually, on demand.
  bool allow_growth = 4;

  // GPU memory allocation strategy: empty selects the default; "BFC"
  // selects the best-fit allocator.
  string allocator_type = 2;

  // Amount of memory whose release is deferred, which avoids frequent
  // interaction with the GPU driver. 0 means the system default, possibly a
  // few MB.
  int64 deferred_deletion_bytes = 3;

  // List of visible GPU devices (compare CUDA_VISIBLE_DEVICES); listed
  // devices are exposed as "/device:GPU:<id>". For example, on a system
  // with 8 GPUs, visible_device_list = "5,3" maps GPU 5 to "/device:GPU:0"
  // and GPU 3 to "/device:GPU:1".
  string visible_device_list = 5;

  // NOTE(review): unit presumably microseconds, per the name suffix.
  int32 polling_active_delay_usecs = 6;

  // NOTE(review): unit presumably milliseconds, per the name suffix.
  int32 polling_inactive_delay_msecs = 7;

  bool force_gpu_compatible = 8;

  // Experimental GPU settings, grouped in their own nested message.
  message Experimental {
    // Per-virtual-device settings; both fields are repeated.
    message VirtualDevices {
      // NOTE(review): unit presumably MB, per the name suffix.
      repeated float memory_limit_mb = 1;
      repeated int32 priority = 2;
    }
    repeated VirtualDevices virtual_devices = 1;

    bool use_unified_memory = 2;
    int32 num_dev_to_dev_copy_streams = 3;
    string collective_ring_order = 4;
    bool timestamped_allocator = 5;

    // Field number 6 is reserved and must not be reused.
    reserved 6;

    int32 kernel_tracker_max_interval = 7;
    int32 kernel_tracker_max_bytes = 8;
    int32 kernel_tracker_max_pending = 9;
    double internal_fragmentation_fraction = 10;
    bool use_cuda_malloc_async = 11;
    bool disallow_retry_on_allocation_failure = 12;
  }
  Experimental experimental = 9;
}
// Settings handed to the graph optimizer.
message OptimizerOptions {
  bool do_common_subexpression_elimination = 1;
  bool do_constant_folding = 2;
  int64 max_folded_constant_in_bytes = 6;
  bool do_function_inlining = 4;

  // Overall optimization level.
  enum Level {
    // Default level. Optimizations performed at L1:
    //   1. Common subexpression elimination
    //   2. Constant folding
    L1 = 0;

    // Optimizations disabled.
    L0 = -1;
  }
  Level opt_level = 3;

  // Governs use of the compiler/JIT. Experimental.
  enum GlobalJitLevel {
    // Default setting ("off" for now, expected to become "on" later).
    DEFAULT = 0;
    OFF = -1;
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;

  bool cpu_global_jit = 7;
}
// Graph-level configuration.
message GraphOptions {
  // Name and number of a removed field; kept reserved so neither is reused.
  reserved "skip_common_subexpression_elimination";
  reserved 1;

  bool enable_recv_scheduling = 2;

  // Settings forwarded to the graph optimizer.
  OptimizerOptions optimizer_options = 3;

  int64 build_cost_model = 4;
  int64 build_cost_model_after = 9;
  bool infer_shapes = 5;
  bool place_pruned_graph = 6;
  bool enable_bfloat16_sendrecv = 7;
  int32 timeline_step = 8;

  // Rewriter configuration; the type comes from the imported
  // rewriter_config.proto.
  RewriterConfig rewrite_options = 10;
}
// Configuration of a single thread pool.
message ThreadPoolOptionProto {
  int32 num_threads = 1;

  // Naming the pool lets it be shared across multiple sessions.
  string global_name = 2;
}
// RPC-related configuration.
message RPCOptions {
  // Forces RPC to be used even for an in-process master, which is
  // convenient for exercising RPC locally.
  bool use_rpc_for_inprocess_master = 1;

  // Compression algorithm: "deflate" or "gzip".
  string compression_algorithm = 2;

  // Compression level, from 0 to 3.
  int32 compression_level = 3;

  bool cache_rpc_response = 4;

  // Turns off TCP connection sharing when a new RPC channel is opened.
  bool disable_session_connection_sharing = 5;

  // Opens several channels to each target so that more network capacity
  // can be used when bandwidth is plentiful.
  int32 num_channels_per_target = 6;
}
// Metadata used for debugging and monitoring by the runtime and ops.
// The (name, version) tuple uniquely identifies a session within a process.
message SessionMetadata {
  string name = 1;

  // Optional. When set, it must be >= 0.
  int64 version = 2;
}
// Session configuration; the other option messages hang off this one.
message ConfigProto {
  // Maps a device name to the maximum number of such devices, e.g.
  // cpu -> 20 means there are 20 CPUs.
  map<string, int32> device_count = 1;

  int32 intra_op_parallelism_threads = 2;
  int32 inter_op_parallelism_threads = 5;
  bool use_per_session_threads = 9;
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;
  int32 placement_period = 3;
  repeated string device_filters = 4;
  GPUOptions gpu_options = 6;

  // Fall back to the CPU when no GPU is available.
  bool allow_soft_placement = 7;

  // Emit log messages describing where ops are placed on devices.
  bool log_device_placement = 8;

  // Graph configuration.
  GraphOptions graph_options = 10;

  // NOTE(review): unit presumably milliseconds, per the name suffix.
  int64 operation_timeout_in_ms = 11;

  RPCOptions rpc_options = 13;

  // Cluster information for the distributed case.
  ClusterDef cluster_def = 14;

  bool isolate_session_state = 15;
  bool share_cluster_devices_in_session = 17;

  // Experimental settings, grouped in their own nested message.
  message Experimental {
    string collective_group_leader = 1;

    // Field number 2 is reserved and must not be reused.
    reserved 2;

    string executor_type = 3;
    int32 recv_buf_max_chunk = 4;
    bool use_numa_affinity = 5;
    bool collective_deterministic_sequential_execution = 6;
    bool collective_nccl = 7;
    bool share_session_state_in_clusterspec_propagation = 8;
    bool disable_thread_spinning = 9;
    bool share_cluster_devices_in_session = 10;
    SessionMetadata session_metadata = 11;
    bool optimize_for_static_graph = 12;
    bool enable_mlir_bridge = 13;

    // Describes the state of the MLIR bridge rollout.
    enum MlirBridgeRollout {
      MLIR_BRIDGE_ROLLOUT_UNSPECIFIED = 0;
      MLIR_BRIDGE_ROLLOUT_ENABLED = 1;
      MLIR_BRIDGE_ROLLOUT_DISABLED = 2;
      MLIR_BRIDGE_ROLLOUT_SAFE_MODE_ENABLED = 3;
      MLIR_BRIDGE_ROLLOUT_SAFE_MODE_FALLBACK_ENABLED = 4;
    }
    MlirBridgeRollout mlir_bridge_rollout = 17;

    bool enable_mlir_graph_optimization = 16;
    bool disable_output_partition_graphs = 14;
    int64 xla_fusion_autotuner_thresh = 15;
    bool use_tfrt = 18;

    // Field numbers 19 and 20 are reserved and must not be reused.
    reserved 19, 20;

    bool disable_functional_ops_lowering = 21;
    bool xla_prefer_single_graph_cluster = 22;

    // Type comes from the imported coordination_config.proto.
    CoordinationServiceConfig coordination_config = 23;
  }
  Experimental experimental = 16;

  // Next: 18
}
// Per-call options for one Run() invocation.
message RunOptions {
  // Available tracing levels.
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

  // How long to wait for the operation to complete, in milliseconds.
  int64 timeout_in_ms = 2;

  int32 inter_op_thread_pool = 3;
  bool output_partition_graphs = 5;

  // EXPERIMENTAL. Options for initializing DebuggerState, when enabled.
  DebugOptions debug_options = 6;

  bool report_tensor_allocations_upon_oom = 7;

  // Experimental settings, grouped in their own nested message.
  message Experimental {
    int64 collective_graph_key = 1;
    bool use_run_handler_pool = 2;

    // Options for the run handler pool.
    message RunHandlerPoolOptions {
      int64 priority = 1;
    }
    RunHandlerPoolOptions run_handler_pool_options = 3;
  }
  Experimental experimental = 8;

  // Field number 4 is reserved and must not be reused.
  reserved 4;
}
// Metadata message carrying step statistics, a cost graph and graph
// definitions.
message RunMetadata {
  StepStats step_stats = 1;
  CostGraphDef cost_graph = 2;
  repeated GraphDef partition_graphs = 3;

  // Graphs recorded for a function: its partition graphs plus the graph
  // before and after optimization.
  message FunctionGraphs {
    // TODO(nareshmodi): Include some sort of function/cache-key identifier?
    repeated GraphDef partition_graphs = 1;

    GraphDef pre_optimization_graph = 2;
    GraphDef post_optimization_graph = 3;
  }
  repeated FunctionGraphs function_graphs = 4;
}
// Describes a connection between two tensors in a `GraphDef`.
message TensorConnection {
  // Name of the source tensor. Its value is substituted for the tensor
  // named in `to_tensor`.
  string from_tensor = 1;

  // Name of the destination tensor. Its value is bound to the value of the
  // tensor named in `from_tensor`.
  string to_tensor = 2;
}
// Describes a subgraph of another `GraphDef` as a set of feed points plus
// the nodes to fetch or execute.
//
// Compare with the arguments to `Session::Run()`.
message CallableOptions {
  // Tensors fed into the callable. Each entry is a tensor name.
  repeated string feed = 1;

  // Fetches: a list of tensor names. The caller expects one tensor back
  // for each fetch[i] (see RunStepResponse.tensor). The order in which
  // fetches are listed does not change the execution order.
  repeated string fetch = 2;

  // Target nodes: a list of node names. These nodes are run by the
  // callable, but their outputs are not returned.
  repeated string target = 3;

  // Options applied to every run.
  RunOptions run_options = 4;

  repeated TensorConnection tensor_connection = 5;

  // Example:
  //
  //   CallableOptions {
  //     feed: "a:0"
  //     feed: "b:0"
  //
  //     fetch: "x:0"
  //     fetch: "y:0"
  //
  //     feed_devices: {
  //       "a:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //     }
  //
  //     fetch_devices: {
  //       "y:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //     }
  //   }
  //
  // means that the Callable expects:
  //   - The first argument ("a:0") is a Tensor backed by GPU memory.
  //   - The second argument ("b:0") is a Tensor backed by host memory.
  // and of its return values:
  //   - The first output ("x:0") will be backed by host memory.
  //   - The second output ("y:0") will be backed by GPU memory.
  //
  map<string, string> feed_devices = 6;
  map<string, string> fetch_devices = 7;

  bool fetch_skip_sync = 8;

  // Next: 9
}
cluster.proto
用于分布式训练,指定集群中所有机器的 IP 和端口。
// This file contains protos to be used when defining a TensorFlow
// cluster.
//
// EXAMPLES
// --------
//
// 1. A single-process cluster, containing "/job:local/task:0".
//
// Cluster:
// job { name: 'local' tasks { key: 0 value: 'localhost:2222' } }
//
// Server:
// cluster { $CLUSTER } job_name: 'local' task_index: 0
//
// 2. A two-process cluster, containing "/job:local/task:{0,1}".
//
// Cluster:
// job { name: 'local' tasks { key: 0 value: 'localhost:2222' }
// tasks { key: 1 value: 'localhost:2223' } }
//
// Servers:
// cluster { $CLUSTER } job_name: 'local' task_index: 0
// cluster { $CLUSTER } job_name: 'local' task_index: 1
//
// 3. A two-job cluster, containing "/job:worker/task:{0,1,2}" and
// "/job:ps/task:{0,1}".
//
// Cluster:
// job { name: 'worker' tasks { key: 0 value: 'worker1:2222' }
// tasks { key: 1 value: 'worker2:2222' }
// tasks { key: 2 value: 'worker3:2222' } }
// job { name: 'ps' tasks { key: 0 value: 'ps0:2222' }
// tasks { key: 1 value: 'ps1:2222' } }
//
// Servers:
// cluster { $CLUSTER } job_name: 'worker' task_index: 0
// cluster { $CLUSTER } job_name: 'worker' task_index: 1
// cluster { $CLUSTER } job_name: 'worker' task_index: 2
// cluster { $CLUSTER } job_name: 'ps' task_index: 0
// cluster { $CLUSTER } job_name: 'ps' task_index: 1
// Describes one job within a TensorFlow cluster.
message JobDef {
  // Name of the job.
  string name = 1;

  // Maps each task ID to a "hostname:port" string.
  //
  // For example, if `name` contains "worker" and `tasks` maps 7 to
  // "example.org:2222", the device prefix "/job:worker/task:7" is assigned
  // to "example.org:2222".
  map<int32, string> tasks = 2;
}
// Describes a TensorFlow cluster as a collection of jobs.
message ClusterDef {
  // Jobs that make up the cluster.
  repeated JobDef job = 1;
}
python中定义cluster
定义在tensorflow/python/training/server_lib.py中
这是python接口,用于生成集群信息。
# Describe a cluster with one "worker" task and two "ps" tasks.
cluster = tf.train.ClusterSpec({
    "worker": ["127.0.0.1:2222"],
    "ps": ["127.0.0.1:3333", "127.0.0.1:4444"],
})
@tf_export("train.ClusterSpec")
class ClusterSpec(object):
  """Represents a cluster as a set of "tasks", organized into "jobs".

  A `tf.train.ClusterSpec` represents the set of processes that
  participate in a distributed TensorFlow computation. Every
  `tf.train.Server` is constructed in a particular cluster.

  To create a cluster with two jobs and five tasks, you specify the
  mapping from job names to lists of network addresses (typically
  hostname-port pairs).

  ```python
  cluster = tf.train.ClusterSpec({"worker": ["worker0.example.com:2222",
                                             "worker1.example.com:2222",
                                             "worker2.example.com:2222"],
                                  "ps": ["ps0.example.com:2222",
                                         "ps1.example.com:2222"]})
  ```

  Each job may also be specified as a sparse mapping from task indices
  to network addresses. This enables a server to be configured without
  needing to know the identity of (for example) all other worker
  tasks:

  ```python
  cluster = tf.train.ClusterSpec({"worker": {1: "worker1.example.com:2222"},
                                  "ps": ["ps0.example.com:2222",
                                         "ps1.example.com:2222"]})
  ```
  """
文章出处登录后可见!
已经登录?立即刷新