TensorFlow: session_options / ConfigProto

SessionOptions can be passed in when a session is created.

The C++ definition is in tensorflow/core/public/session_options.h.

Env: the execution environment.

target: where the graph is executed; it can be local, or a connection to a remote server.

ConfigProto is the core piece of configuration; all the remaining options live inside it.

struct SessionOptions {
  /// The environment to use.
  Env* env;

  /// Where to run the graph: "local", an "ip:port", or a "host:port" address;
  /// it may also be a list of such targets.
  std::string target;

  /// Configuration options.
  ConfigProto config;

  SessionOptions();
};

ConfigProto

ConfigProto is defined as a protobuf message in tensorflow/core/protobuf/config.proto.

It groups together many kinds of options (a short usage sketch follows the list):

  1. GPU options
  2. gRPC options
  3. Graph options
  4. Device options: mapping and placement
  5. Distributed cluster options
  6. Thread pool options
  7. sess.run options
  8. Debug options
  9. Optimizer options
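As orientation before reading the proto, here is a minimal sketch of building a ConfigProto that touches a few of these groups and passing it to a session. It assumes the TF1-style tf.compat.v1 API; the values are illustrative.

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()  # only needed when running under TF 2.x

config = tf.ConfigProto(
    intra_op_parallelism_threads=4,   # threads used inside a single op
    inter_op_parallelism_threads=2,   # threads used to run independent ops
    allow_soft_placement=True,        # fall back to another device if needed
    log_device_placement=False,       # set True to log placement decisions
)
config.gpu_options.allow_growth = True  # grab GPU memory on demand

with tf.Session(config=config) as sess:
    pass  # build and run a graph here

The definition in config.proto: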
syntax = "proto3";

package tensorflow;

import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/protobuf/cluster.proto";
import "tensorflow/core/protobuf/coordination_config.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/rewriter_config.proto";

option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";

message GPUOptions {
  // Fraction of the available GPU memory that the process may use.
  double per_process_gpu_memory_fraction = 1;

  // Grow GPU memory usage on demand instead of allocating it all up front.
  bool allow_growth = 4;

  // GPU memory allocation strategy: empty means the default; "BFC" selects the
  // best-fit-with-coalescing allocator.
  string allocator_type = 2;

  // Delay freeing up to this many bytes, to avoid frequent interaction with the
  // GPU driver. 0 means the system default, which is typically a few MB.
  int64 deferred_deletion_bytes = 3;

  // Comma-separated list of visible GPU device IDs (analogous to
  // CUDA_VISIBLE_DEVICES), mapped to /device:GPU:<id>.
  // E.g. on a machine with 8 GPUs, to map physical GPUs 5 and 3 to
  // "/device:GPU:0" and "/device:GPU:1": visible_device_list = "5,3"
  string visible_device_list = 5;


  int32 polling_active_delay_usecs = 6;
  int32 polling_inactive_delay_msecs = 7;
  bool force_gpu_compatible = 8;

  message Experimental {
    
    message VirtualDevices {
      repeated float memory_limit_mb = 1;
      repeated int32 priority = 2;
    }
    repeated VirtualDevices virtual_devices = 1;
    bool use_unified_memory = 2;
    int32 num_dev_to_dev_copy_streams = 3;
    string collective_ring_order = 4;
    bool timestamped_allocator = 5;

    // reserved id: 6

    int32 kernel_tracker_max_interval = 7;
   
    int32 kernel_tracker_max_bytes = 8;

    int32 kernel_tracker_max_pending = 9;

    double internal_fragmentation_fraction = 10;

    bool use_cuda_malloc_async = 11;

    bool disallow_retry_on_allocation_failure = 12;
  }

  Experimental experimental = 9;
}
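The GPU options above are set on gpu_options of a ConfigProto. A sketch, assuming the tf.compat.v1 API; the memory fraction and device list are illustrative values:

import tensorflow.compat.v1 as tf

config = tf.ConfigProto()
# Use at most ~40% of each visible GPU's memory...
config.gpu_options.per_process_gpu_memory_fraction = 0.4
# ...or instead grow memory usage on demand.
config.gpu_options.allow_growth = True
# Expose only physical GPUs 5 and 3, as /device:GPU:0 and /device:GPU:1.
config.gpu_options.visible_device_list = "5,3"
# Pass via tf.Session(config=config).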

// Options passed to the graph optimizer
message OptimizerOptions {
  bool do_common_subexpression_elimination = 1;
  bool do_constant_folding = 2;
  int64 max_folded_constant_in_bytes = 6;
  bool do_function_inlining = 4;

  // Optimization level
  enum Level {
    // L1 is the default level.
    // Optimization performed at L1 :
    // 1. Common subexpression elimination
    // 2. Constant folding
    L1 = 0;

    // No optimizations
    L0 = -1;
  }
  Level opt_level = 3;

  // Control the use of the compiler/jit.  Experimental.
  enum GlobalJitLevel {
    DEFAULT = 0;  // Default setting ("off" now, but later expected to be "on")
    OFF = -1;
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;
  bool cpu_global_jit = 7;
}
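These settings live under graph_options.optimizer_options of the ConfigProto. A sketch (tf.compat.v1 API) of disabling the basic optimizations or enabling the XLA JIT:

import tensorflow.compat.v1 as tf

config = tf.ConfigProto()
opts = config.graph_options.optimizer_options
# Turn off common-subexpression elimination and constant folding.
opts.opt_level = tf.OptimizerOptions.L0
# Or enable XLA JIT compilation for the graph (experimental).
opts.global_jit_level = tf.OptimizerOptions.ON_1
# Pass via tf.Session(config=config).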

message GraphOptions {
  reserved "skip_common_subexpression_elimination";
  reserved 1;
  bool enable_recv_scheduling = 2;
  OptimizerOptions optimizer_options = 3;
  int64 build_cost_model = 4;
  int64 build_cost_model_after = 9;
  bool infer_shapes = 5;
  bool place_pruned_graph = 6;
  bool enable_bfloat16_sendrecv = 7;
  int32 timeline_step = 8;
  RewriterConfig rewrite_options = 10;
}

message ThreadPoolOptionProto {
  int32 num_threads = 1;
  // If set, sessions that use the same global_name share the thread pool.
  string global_name = 2;
}
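A sketch of adding a named inter-op thread pool that several sessions in the same process can share (the thread count and pool name are placeholders):

import tensorflow.compat.v1 as tf

config = tf.ConfigProto()
pool = config.session_inter_op_thread_pool.add()
pool.num_threads = 4
pool.global_name = "shared_inter_op_pool"  # sessions using the same name share the pool
# A session created with this config can then select the pool per run via
# RunOptions.inter_op_thread_pool (an index into this repeated field).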

message RPCOptions {
  // Force the use of RPC even when the master runs in the same process,
  // which is convenient for exercising the RPC path locally.
  bool use_rpc_for_inprocess_master = 1;

  // Compression algorithm: "deflate" or "gzip".
  string compression_algorithm = 2;
  // Compression level, from 0 to 3.
  int32 compression_level = 3;

  bool cache_rpc_response = 4;

  // Disables TCP connection sharing when opening a new RPC channel.
  bool disable_session_connection_sharing = 5;

  // Open multiple channels to the same target; when plenty of network
  // bandwidth is available this lets a client use more of it.
  int32 num_channels_per_target = 6;
}
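In a distributed job these fields are set the same way as any other ConfigProto field, e.g. to enable gRPC compression (a sketch; the level is illustrative):

import tensorflow.compat.v1 as tf

config = tf.ConfigProto()
config.rpc_options.compression_algorithm = "gzip"
config.rpc_options.compression_level = 2  # 0 (none) .. 3 (highest)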


// Metadata about the session, used by the runtime and ops for debugging
// and monitoring.
// The (name, version) pair uniquely identifies a session within a process.
message SessionMetadata {
  string name = 1;

  // The version is optional. If set, needs to be >= 0.
  int64 version = 2;
}
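SessionMetadata is filled in through ConfigProto.experimental (the field appears further below); a sketch with a placeholder name:

import tensorflow.compat.v1 as tf

config = tf.ConfigProto()
config.experimental.session_metadata.name = "my_session"  # placeholder name
config.experimental.session_metadata.version = 1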


message ConfigProto {
  // Map from device type name (e.g. "CPU", "GPU") to the maximum number of
  // devices of that type to use; e.g. {"CPU": 20} allows up to 20 CPU devices.
  map<string, int32> device_count = 1;

  int32 intra_op_parallelism_threads = 2;

  int32 inter_op_parallelism_threads = 5;

  bool use_per_session_threads = 9;

  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;

  int32 placement_period = 3;

  repeated string device_filters = 4;

  GPUOptions gpu_options = 6;

  // If the requested device is unavailable (e.g. there is no GPU), allow the
  // op to be placed on another device such as the CPU.
  bool allow_soft_placement = 7;

  // Log device placement decisions.
  bool log_device_placement = 8;

  // Graph options.
  GraphOptions graph_options = 10;

  int64 operation_timeout_in_ms = 11;

  RPCOptions rpc_options = 13;

  // Cluster definition (ClusterDef) for distributed execution.
  ClusterDef cluster_def = 14;

  bool isolate_session_state = 15;

  bool share_cluster_devices_in_session = 17;

  message Experimental {
    string collective_group_leader = 1;
    reserved 2;
    string executor_type = 3;
    int32 recv_buf_max_chunk = 4;
    bool use_numa_affinity = 5;
    bool collective_deterministic_sequential_execution = 6;
    bool collective_nccl = 7;
    bool share_session_state_in_clusterspec_propagation = 8;
    bool disable_thread_spinning = 9;
    bool share_cluster_devices_in_session = 10;
    SessionMetadata session_metadata = 11;
    bool optimize_for_static_graph = 12;
    bool enable_mlir_bridge = 13;

    // An enum that describes the state of the MLIR bridge rollout.
    enum MlirBridgeRollout {
      MLIR_BRIDGE_ROLLOUT_UNSPECIFIED = 0;
      MLIR_BRIDGE_ROLLOUT_ENABLED = 1;
      MLIR_BRIDGE_ROLLOUT_DISABLED = 2;
      MLIR_BRIDGE_ROLLOUT_SAFE_MODE_ENABLED = 3;
      MLIR_BRIDGE_ROLLOUT_SAFE_MODE_FALLBACK_ENABLED = 4;
    }
    MlirBridgeRollout mlir_bridge_rollout = 17;
    bool enable_mlir_graph_optimization = 16;
    bool disable_output_partition_graphs = 14;
    int64 xla_fusion_autotuner_thresh = 15;
    bool use_tfrt = 18;
    reserved 19;
    reserved 20;
    bool disable_functional_ops_lowering = 21;
    bool xla_prefer_single_graph_cluster = 22;
    CoordinationServiceConfig coordination_config = 23;
  }

  Experimental experimental = 16;

  // Next: 18
}
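A couple of the top-level fields in use, e.g. hiding the GPUs via device_count and limiting which remote devices a session sees via device_filters (a sketch; the filter strings are illustrative):

import tensorflow.compat.v1 as tf

# Force a CPU-only session by declaring zero usable GPU devices.
config = tf.ConfigProto(device_count={"GPU": 0})

# In a distributed job, only talk to the parameter servers and our own worker task.
config.device_filters.append("/job:ps")
config.device_filters.append("/job:worker/task:0")
# Pass via tf.Session(config=config).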

// Options for a single Run() call.
message RunOptions {
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

  // Time to wait for operation to complete in milliseconds.
  int64 timeout_in_ms = 2;

  int32 inter_op_thread_pool = 3;

  bool output_partition_graphs = 5;

  // EXPERIMENTAL.  Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;

  bool report_tensor_allocations_upon_oom = 7;
  message Experimental {
    int64 collective_graph_key = 1;
    bool use_run_handler_pool = 2;
    message RunHandlerPoolOptions {
      int64 priority = 1;
    }
    RunHandlerPoolOptions run_handler_pool_options = 3;
  }

  Experimental experimental = 8;

  reserved 4;
}
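RunOptions and the RunMetadata message below are passed to an individual Session.run() call; a common pattern is full tracing of one step (tf.compat.v1 API):

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()  # only needed when running under TF 2.x

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()

with tf.Session() as sess:
    x = tf.constant(1.0) + tf.constant(2.0)
    sess.run(x, options=run_options, run_metadata=run_metadata)

# step_stats now holds per-device timing for the traced step.
print(run_metadata.step_stats)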


message RunMetadata {
  StepStats step_stats = 1;

  CostGraphDef cost_graph = 2;

  repeated GraphDef partition_graphs = 3;

  message FunctionGraphs {
    // TODO(nareshmodi): Include some sort of function/cache-key identifier?
    repeated GraphDef partition_graphs = 1;

    GraphDef pre_optimization_graph = 2;
    GraphDef post_optimization_graph = 3;
  }
  repeated FunctionGraphs function_graphs = 4;
}

// Defines a connection between two tensors in a `GraphDef`.
message TensorConnection {
  // A tensor name. The value of this tensor will be substituted for
  // the tensor named in `to_tensor`.
  string from_tensor = 1;

  // A tensor name. The value of this tensor will be bound to the
  // value of the tensor named in `from_tensor`.
  string to_tensor = 2;
}

// Defines a subgraph in another `GraphDef` as a set of feed points and nodes
// to be fetched or executed.
//
// Compare with the arguments to `Session::Run()`.
message CallableOptions {
  // Tensors to be fed in the callable. Each feed is the name of a tensor.
  repeated string feed = 1;

  // Fetches. A list of tensor names. The caller of the callable expects a
  // tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The
  // order of specified fetches does not change the execution order.
  repeated string fetch = 2;

  // Target Nodes. A list of node names. The named nodes will be run by the
  // callable but their outputs will not be returned.
  repeated string target = 3;

  // Options that will be applied to each run.
  RunOptions run_options = 4;
  repeated TensorConnection tensor_connection = 5;
  // CallableOptions {
  //   feed: "a:0"
  //   feed: "b:0"
  //
  //   fetch: "x:0"
  //   fetch: "y:0"
  //
  //   feed_devices: {
  //     "a:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //   }
  //
  //   fetch_devices: {
  //     "y:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //  }
  // }
  //
  // means that the Callable expects:
  // - The first argument ("a:0") is a Tensor backed by GPU memory.
  // - The second argument ("b:0") is a Tensor backed by host memory.
  // and of its return values:
  // - The first output ("x:0") will be backed by host memory.
  // - The second output ("y:0") will be backed by GPU memory.
  //

  map<string, string> feed_devices = 6;
  map<string, string> fetch_devices = 7;
  bool fetch_skip_sync = 8;

  // Next: 9
}
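CallableOptions is normally built as a plain protobuf. The snippet below sketches constructing one along the lines of the comment above; the tensor names are placeholders, and note that the Python client method that consumes it, Session._make_callable_from_options, is a private/internal API.

from tensorflow.core.protobuf import config_pb2

opts = config_pb2.CallableOptions()
opts.feed.append("a:0")
opts.feed.append("b:0")
opts.fetch.append("x:0")
# Pin the first feed to GPU memory, as in the feed_devices example above.
opts.feed_devices["a:0"] = "/job:localhost/replica:0/task:0/device:GPU:0"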

cluster.proto

Used for distributed training: it specifies the IP address and port of every machine in the cluster.


// This file contains protos to be used when defining a TensorFlow
// cluster.
//
// EXAMPLES
// --------
//
// 1. A single-process cluster, containing "/job:local/task:0".
//
//    Cluster:
//      job { name: 'local' tasks { key: 0 value: 'localhost:2222' } }
//
//    Server:
//      cluster { $CLUSTER } job_name: 'local' task_index: 0
//
// 2. A two-process cluster, containing "/job:local/task:{0,1}".
//
//    Cluster:
//      job { name: 'local' tasks { key: 0 value: 'localhost:2222' }
//                          tasks { key: 1 value: 'localhost:2223' } }
//
//    Servers:
//      cluster { $CLUSTER } job_name: 'local' task_index: 0
//      cluster { $CLUSTER } job_name: 'local' task_index: 1
//
// 3. A two-job cluster, containing "/job:worker/task:{0,1,2}" and
//    "/job:ps/task:{0,1}".
//
//    Cluster:
//      job { name: 'worker' tasks { key: 0 value: 'worker1:2222' }
//                           tasks { key: 1 value: 'worker2:2222' }
//                           tasks { key: 2 value: 'worker3:2222' } }
//      job { name: 'ps'     tasks { key: 0 value: 'ps0:2222' }
//                           tasks { key: 1 value: 'ps1:2222' } }
//
//    Servers:
//      cluster { $CLUSTER } job_name: 'worker' task_index: 0
//      cluster { $CLUSTER } job_name: 'worker' task_index: 1
//      cluster { $CLUSTER } job_name: 'worker' task_index: 2
//      cluster { $CLUSTER } job_name: 'ps'     task_index: 0
//      cluster { $CLUSTER } job_name: 'ps'     task_index: 1

// Defines a single job in a TensorFlow cluster.
message JobDef {
  // The name of this job.
  string name = 1;

  // Mapping from task ID to "hostname:port" string.
  //
  // If the `name` field contains "worker", and the `tasks` map contains a
  // mapping from 7 to "example.org:2222", then the device prefix
  // "/job:worker/task:7" will be assigned to "example.org:2222".
  map<int32, string> tasks = 2;
}

// Defines a TensorFlow cluster as a set of jobs.
message ClusterDef {
  // The jobs that comprise the cluster.
  repeated JobDef job = 1;
}

Defining the cluster in Python


It is defined in tensorflow/python/training/server_lib.py.

This is the Python interface for describing the cluster.

cluster = tf.train.ClusterSpec({
    "worker": [
        "127.0.0.1:2222",
    ],
    "ps": [
        "127.0.0.1:3333",
        "127.0.0.1:4444",
    ]})
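The resulting ClusterSpec can be converted to the ClusterDef proto shown above (for ConfigProto.cluster_def), or used to start an in-process server for one of the tasks. A sketch:

import tensorflow.compat.v1 as tf

cluster = tf.train.ClusterSpec({
    "worker": ["127.0.0.1:2222"],
    "ps": ["127.0.0.1:3333", "127.0.0.1:4444"],
})

# Embed the cluster into a session config.
config = tf.ConfigProto()
config.cluster_def.CopyFrom(cluster.as_cluster_def())

# Or start a server for one of the tasks.
server = tf.train.Server(cluster, job_name="worker", task_index=0)

For reference, the start of the ClusterSpec class in server_lib.py: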



@tf_export("train.ClusterSpec")
class ClusterSpec(object):
  """Represents a cluster as a set of "tasks", organized into "jobs".

  A `tf.train.ClusterSpec` represents the set of processes that
  participate in a distributed TensorFlow computation. Every
  `tf.train.Server` is constructed in a particular cluster.

  To create a cluster with two jobs and five tasks, you specify the
  mapping from job names to lists of network addresses (typically
  hostname-port pairs).

  ```python
  cluster = tf.train.ClusterSpec({"worker": ["worker0.example.com:2222",
                                             "worker1.example.com:2222",
                                             "worker2.example.com:2222"],
                                  "ps": ["ps0.example.com:2222",
                                         "ps1.example.com:2222"]})
  ```

  Each job may also be specified as a sparse mapping from task indices
  to network addresses. This enables a server to be configured without
  needing to know the identity of (for example) all other worker
  tasks:

  ```python
  cluster = tf.train.ClusterSpec({"worker": {1: "worker1.example.com:2222"},
                                  "ps": ["ps0.example.com:2222",
                                         "ps1.example.com:2222"]})
  ```
  """
