Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Fix pai reuse mode #4027

Merged
28 commits merged (author and branch names were lost in capture)
Aug 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions nni/experiment/config/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
def to_v2(v1) -> ExperimentConfig:
v1 = copy.deepcopy(v1)
platform = v1.pop('trainingServicePlatform')
assert platform in ['local', 'remote', 'openpai', 'aml']
assert platform in ['local', 'remote', 'pai', 'aml']
if platform == 'pai':
platform = 'openpai'
v2 = ExperimentConfig(platform)

_drop_field(v1, 'authorName')
Expand Down Expand Up @@ -88,7 +90,7 @@ def to_v2(v1) -> ExperimentConfig:
if 'memoryMB' in v1_trial:
ts.trial_memory_size = str(v1_trial.pop('memoryMB')) + 'mb'
_move_field(v1_trial, ts, 'image', 'docker_image')
_deprecate(v1_trial, v2, 'virtualCluster')
_move_field(v1_trial, ts, 'virtualCluster', 'virtual_cluster')
_move_field(v1_trial, ts, 'paiStorageConfigName', 'storage_config_name')
_move_field(v1_trial, ts, 'paiConfigPath', 'openpaiConfigFile')

Expand Down
1 change: 1 addition & 0 deletions nni/experiment/config/openpai.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class OpenpaiConfig(TrainingServiceConfig):
trial_memory_size: str
storage_config_name: str
docker_image: str = 'msranni/nni:latest'
virtual_cluster: Optional[str]
local_storage_mount_point: PathLike
container_storage_mount_point: str
reuse_mode: bool = True
Expand Down
3 changes: 2 additions & 1 deletion ts/nni_manager/common/experimentConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ export interface OpenpaiConfig extends TrainingServiceConfig {
containerStorageMountPoint: string;
reuseMode: boolean;
openpaiConfig?: object;
virtualCluster?: string;
}

/* AML */
Expand Down Expand Up @@ -198,7 +199,7 @@ export function toSeconds(time: string): number {
throw new Error(`Bad time string "${time}"`);
}

const sizeUnits = { tb: 1024 * 1024, gb: 1024 * 1024, mb: 1, kb: 1 / 1024 };
const sizeUnits = { tb: 1024 * 1024, gb: 1024, mb: 1, kb: 1 / 1024 };

export function toMegaBytes(size: string): number {
for (const [unit, factor] of Object.entries(sizeUnits)) {
Expand Down
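[File header missing from capture. The relative imports below ('../../../common/experimentConfig') indicate a file three directories below ts/nni_manager/ that defines OpenPaiEnvironmentService; exact path to be confirmed.]
Original file line number Diff line number Diff line change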
Expand Up @@ -5,6 +5,7 @@

import * as yaml from 'js-yaml';
import * as request from 'request';
import { Container, Scope } from 'typescript-ioc';
import { Deferred } from 'ts-deferred';
import * as component from '../../../common/component';
import { ExperimentConfig, OpenpaiConfig, flattenConfig, toMegaBytes } from '../../../common/experimentConfig';
Expand All @@ -15,6 +16,7 @@ import { NNIPAITrialConfig } from '../../pai/paiConfig';
import { EnvironmentInformation, EnvironmentService } from '../environment';
import { SharedStorageService } from '../sharedStorage';
import { MountedStorageService } from '../storages/mountedStorageService';
import { StorageService } from '../storageService';

interface FlattenOpenpaiConfig extends ExperimentConfig, OpenpaiConfig { }

Expand All @@ -38,9 +40,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
this.config = flattenConfig(config, 'openpai');
this.paiToken = this.config.token;
this.protocol = this.config.host.toLowerCase().startsWith('https://') ? 'https' : 'http';

// FIXME: only support MountedStorageService
const storageService = new MountedStorageService();
Container.bind(StorageService)
.to(MountedStorageService)
.scope(Scope.Singleton);
const storageService = component.get<StorageService>(StorageService)
const remoteRoot = storageService.joinPath(this.config.localStorageMountPoint, this.experimentId);
storageService.initialize(this.config.localStorageMountPoint, remoteRoot);
}
Expand Down Expand Up @@ -286,7 +289,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
taskRetryCount: 0,
dockerImage: 'docker_image_0',
resourcePerInstance: {
gpu: this.config.trialGpuNumber,
gpu: this.config.trialGpuNumber === undefined? 0: this.config.trialGpuNumber,
cpu: this.config.trialCpuNumber,
memoryMB: toMegaBytes(this.config.trialMemorySize)
},
Expand All @@ -304,9 +307,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
submitFrom: 'submit-job-v2'
}
}
if (this.config.deprecated && this.config.deprecated.virtualCluster) {
if (this.config.virtualCluster) {
nniJobConfig.defaults = {
virtualCluster: this.config.deprecated.virtualCluster
virtualCluster: this.config.virtualCluster
}
}
}
Expand Down