oahzxl committed
Commit a28e78a
1 Parent(s): a112eac
README.md CHANGED
@@ -11,10 +11,11 @@ app_port: 7860
  app_file: app.py
  models:
  - THUDM/CogVideoX-2b
+ - THUDM/CogVideoX-5b
  tags:
  - cogvideox
  - video-generation
- - thudm
+ - videosys
  short_description: Text-to-Video
  disable_embedding: false
  ---
app.py CHANGED
@@ -13,9 +13,9 @@ import spaces
  from videosys import CogVideoXConfig, CogVideoXPABConfig, VideoSysEngine


- def load_model(enable_video_sys=False, pab_threshold=[100, 850], pab_range=2):
+ def load_model(model_name, enable_video_sys=False, pab_threshold=[100, 850], pab_range=2):
      pab_config = CogVideoXPABConfig(spatial_threshold=pab_threshold, spatial_range=pab_range)
-     config = CogVideoXConfig(world_size=1, enable_pab=enable_video_sys, pab_config=pab_config)
+     config = CogVideoXConfig(model_name, enable_pab=enable_video_sys, pab_config=pab_config)
      engine = VideoSysEngine(config)
      return engine

@@ -50,15 +50,16 @@ def get_server_status():
      return {"cpu": f"{cpu_percent}%", "memory": f"{memory.percent}%", "disk": f"{disk.percent}%", "gpu": gpu_info}


- @spaces.GPU(duration=240)
- def generate_vanilla(prompt, num_inference_steps, guidance_scale, progress=gr.Progress(track_tqdm=True)):
-     engine = load_model()
+ @spaces.GPU(duration=400)
+ def generate_vanilla(model_name, prompt, num_inference_steps, guidance_scale, progress=gr.Progress(track_tqdm=True)):
+     engine = load_model(model_name)
      video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
      return video_path


- @spaces.GPU(duration=240)
+ @spaces.GPU(duration=360)
  def generate_vs(
+     model_name,
      prompt,
      num_inference_steps,
      guidance_scale,
@@ -69,7 +70,7 @@ def generate_vs(
  ):
      threshold = [int(threshold_end), int(threshold_start)]
      gap = int(gap)
-     engine = load_model(enable_video_sys=True, pab_threshold=threshold, pab_range=gap)
+     engine = load_model(model_name, enable_video_sys=True, pab_threshold=threshold, pab_range=gap)
      video_path = generate(engine, prompt, num_inference_steps, guidance_scale)
      return video_path

@@ -202,10 +203,14 @@ with gr.Blocks(css=css) as demo:

      with gr.Row():
          with gr.Column():
-             prompt = gr.Textbox(label="Prompt (Less than 200 Words)", value="Sunset over the sea.", lines=4)
+             prompt = gr.Textbox(label="Prompt (Less than 200 Words)", value="Sunset over the sea.", lines=3)

          with gr.Column():
              gr.Markdown("**Generation Parameters**<br>")
+             with gr.Row():
+                 model_name = gr.Dropdown(
+                     ["THUDM/CogVideoX-2b", "THUDM/CogVideoX-5b"], label="Model Type", value="THUDM/CogVideoX-2b"
+                 )
              with gr.Row():
                  num_inference_steps = gr.Number(label="Inference Steps", value=50)
                  guidance_scale = gr.Number(label="Guidance Scale", value=6.0)
@@ -240,7 +245,7 @@ with gr.Blocks(css=css) as demo:

      generate_button.click(
          generate_vanilla,
-         inputs=[prompt, num_inference_steps, guidance_scale],
+         inputs=[model_name, prompt, num_inference_steps, guidance_scale],
          outputs=[video_output],
          concurrency_id="gen",
          concurrency_limit=1,
@@ -248,7 +253,15 @@ with gr.Blocks(css=css) as demo:

      generate_button_vs.click(
          generate_vs,
-         inputs=[prompt, num_inference_steps, guidance_scale, pab_threshold_start, pab_threshold_end, pab_range],
+         inputs=[
+             model_name,
+             prompt,
+             num_inference_steps,
+             guidance_scale,
+             pab_threshold_start,
+             pab_threshold_end,
+             pab_range,
+         ],
          outputs=[video_output_vs],
          concurrency_id="gen",
          concurrency_limit=1,
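For context, a minimal sketch of how the reworked loader is meant to be called once a checkpoint has been picked from the new dropdown. It assumes the `videosys` package from this Space is importable and a CUDA GPU is available; the prompt and output path are placeholders.

```python
from videosys import CogVideoXConfig, CogVideoXPABConfig, VideoSysEngine


def load_model(model_name, enable_video_sys=False, pab_threshold=[100, 850], pab_range=2):
    # mirrors app.py: build a PAB config and hand the chosen checkpoint to the engine
    pab_config = CogVideoXPABConfig(spatial_threshold=pab_threshold, spatial_range=pab_range)
    config = CogVideoXConfig(model_name, enable_pab=enable_video_sys, pab_config=pab_config)
    return VideoSysEngine(config)


# baseline path (generate_vanilla): no PAB, model chosen in the UI
engine = load_model("THUDM/CogVideoX-2b")
video = engine.generate(prompt="Sunset over the sea.", guidance_scale=6.0, num_inference_steps=50).video[0]
engine.save_video(video, "./outputs/sunset.mp4")
```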
videosys/core/engine.py CHANGED
@@ -20,7 +20,7 @@ class VideoSysEngine:
          self._init_worker(config.pipeline_cls)

      def _init_worker(self, pipeline_cls):
-         world_size = self.config.world_size
+         world_size = self.config.num_gpus

          if "CUDA_VISIBLE_DEVICES" not in os.environ:
              os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(world_size))
@@ -68,7 +68,7 @@ class VideoSysEngine:

      # TODO: add more options here for pipeline, or wrap all options into config
      def _create_pipeline(self, pipeline_cls, rank=0, local_rank=0, distributed_init_method=None):
-         videosys.initialize(rank=rank, world_size=self.config.world_size, init_method=distributed_init_method, seed=42)
+         videosys.initialize(rank=rank, world_size=self.config.num_gpus, init_method=distributed_init_method, seed=42)

          pipeline = pipeline_cls(self.config)
          return pipeline
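With `world_size` renamed to `num_gpus`, multi-GPU inference is now requested on the config object passed to the engine. A short usage sketch, assuming two visible GPUs and the CogVideoX pipeline from this repo:

```python
from videosys import CogVideoXConfig, VideoSysEngine

# the engine reads config.num_gpus to set CUDA_VISIBLE_DEVICES and the distributed world size
config = CogVideoXConfig("THUDM/CogVideoX-2b", num_gpus=2)
engine = VideoSysEngine(config)

video = engine.generate(prompt="Sunset over the sea.", guidance_scale=6.0, num_inference_steps=50).video[0]
engine.save_video(video, "./outputs/sunset_2gpu.mp4")
```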
videosys/core/pab_mgr.py CHANGED
@@ -94,6 +94,7 @@ class PABManager:
      @staticmethod
      def _is_t_in_skip_config(all_timesteps, timestep, config):
          is_t_in_skip_config = False
+         skip_range = None
          for key in config:
              if key not in all_timesteps:
                  continue
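`skip_range` is now initialized before the loop, so it is defined even when no key from `config` matches a timestep in `all_timesteps`. The rest of the loop body is not shown in this diff; the sketch below only illustrates that failure mode, and the config layout (anchor timestep mapped to a window length) is an assumption, not the actual implementation.

```python
def _is_t_in_skip_config(all_timesteps, timestep, config):
    # hypothetical reconstruction: config maps an anchor timestep to a skip-window length
    is_t_in_skip_config = False
    skip_range = None  # defined up front; stays None when no config key is an actual timestep
    for key in config:
        if key not in all_timesteps:
            continue
        index = all_timesteps.index(key)
        skip_range = all_timesteps[index : index + config[key]]
        if timestep in skip_range:
            is_t_in_skip_config = True
            break
    return is_t_in_skip_config, skip_range


# with no matching key, this sketch returns (False, None) instead of touching an unbound name
print(_is_t_in_skip_config([900, 800, 700], 800, {123: 2}))
```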
videosys/core/pipeline.py CHANGED
@@ -1,3 +1,4 @@
+ import inspect
  from abc import abstractmethod
  from dataclasses import dataclass

@@ -28,6 +29,23 @@ class VideoSysPipeline(DiffusionPipeline):
          """
          return self.generate(*args, **kwargs)

+     @classmethod
+     def _get_signature_keys(cls, obj):
+         parameters = inspect.signature(obj.__init__).parameters
+         required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+         optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+         expected_modules = set(required_parameters.keys()) - {"self"}
+         # modify: remove the config module from the expected modules
+         expected_modules = expected_modules - {"config"}
+
+         optional_names = list(optional_parameters)
+         for name in optional_names:
+             if name in cls._optional_components:
+                 expected_modules.add(name)
+                 optional_parameters.remove(name)
+
+         return expected_modules, optional_parameters
+

  @dataclass
  class VideoSysPipelineOutput(BaseOutput):
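A self-contained illustration of what the `_get_signature_keys` override computes: required `__init__` arguments (minus `self` and the extra `config` argument VideoSys pipelines take) become expected modules, and optional arguments listed in `_optional_components` are promoted as well. `DummyPipeline` is a made-up class used only for this demonstration.

```python
import inspect


class DummyPipeline:
    _optional_components = ["scheduler"]

    def __init__(self, config, vae, transformer, scheduler=None, device="cuda"):
        pass

    @classmethod
    def _get_signature_keys(cls, obj):
        # same logic as the override above
        parameters = inspect.signature(obj.__init__).parameters
        required = {k: v for k, v in parameters.items() if v.default is inspect.Parameter.empty}
        optional = {k for k, v in parameters.items() if v.default is not inspect.Parameter.empty}
        expected = set(required) - {"self", "config"}
        for name in list(optional):
            if name in cls._optional_components:
                expected.add(name)
                optional.remove(name)
        return expected, optional


expected, optional = DummyPipeline._get_signature_keys(DummyPipeline)
print(sorted(expected))  # ['scheduler', 'transformer', 'vae']
print(sorted(optional))  # ['device']
```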
videosys/pipelines/cogvideox/pipeline_cogvideox.py CHANGED
@@ -46,30 +46,75 @@ class CogVideoXPABConfig(PABConfig):


  class CogVideoXConfig:
+     """
+     This config is to instantiate a `CogVideoXPipeline` class for video generation.
+
+     To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+     In the engine, it will be used to instantiate the corresponding pipeline class.
+     And the engine will call the `generate` function of the pipeline to generate the video.
+     If you want to explore the detail of generation, please refer to the pipeline class below.
+
+     Args:
+         model_path (str):
+             A path to the pretrained pipeline. Defaults to "THUDM/CogVideoX-2b".
+         num_gpus (int):
+             The number of GPUs to use. Defaults to 1.
+         cpu_offload (bool):
+             Whether to enable CPU offload. Defaults to False.
+         vae_tiling (bool):
+             Whether to enable tiling for the VAE. Defaults to True.
+         enable_pab (bool):
+             Whether to enable Pyramid Attention Broadcast. Defaults to False.
+         pab_config (CogVideoXPABConfig):
+             The configuration for Pyramid Attention Broadcast. Defaults to `CogVideoXPABConfig()`.
+
+     Examples:
+         ```python
+         from videosys import CogVideoXConfig, VideoSysEngine
+
+         # models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
+         # change num_gpus for multi-gpu inference
+         config = CogVideoXConfig("THUDM/CogVideoX-2b", num_gpus=1)
+         engine = VideoSysEngine(config)
+
+         prompt = "Sunset over the sea."
+         # num frames should be <= 49. resolution is fixed to 720p.
+         video = engine.generate(
+             prompt=prompt,
+             guidance_scale=6,
+             num_inference_steps=50,
+             num_frames=49,
+         ).video[0]
+         engine.save_video(video, f"./outputs/{prompt}.mp4")
+         ```
+     """
+
      def __init__(
          self,
          model_path: str = "THUDM/CogVideoX-2b",
-         world_size: int = 1,
+         # ======= distributed ========
+         num_gpus: int = 1,
+         # ======= memory =======
+         cpu_offload: bool = False,
          vae_tiling: bool = True,
+         # ======= pab ========
          enable_pab: bool = False,
          pab_config=CogVideoXPABConfig(),
      ):
-         # ======= engine ========
-         self.world_size = world_size
-
-         # ======= pipeline ========
+         self.model_path = model_path
          self.pipeline_cls = CogVideoXPipeline
-
+         # ======= distributed ========
+         self.num_gpus = num_gpus
+         # ======= memory ========
+         self.cpu_offload = cpu_offload
          self.vae_tiling = vae_tiling
-
-         # ======= model ========
-         self.model_path = model_path
+         # ======= pab ========
          self.enable_pab = enable_pab
          self.pab_config = pab_config


  class CogVideoXPipeline(VideoSysPipeline):
-     _optional_components = []
+     _optional_components = ["tokenizer", "text_encoder", "vae", "transformer", "scheduler"]
      model_cpu_offload_seq = "text_encoder->transformer->vae"
      _callback_tensor_inputs = [
          "latents",
@@ -86,11 +131,13 @@ class CogVideoXPipeline(VideoSysPipeline):
          transformer: Optional[CogVideoXTransformer3DModel] = None,
          scheduler: Optional[CogVideoXDDIMScheduler] = None,
          device: torch.device = torch.device("cuda"),
-         dtype: torch.dtype = torch.float16,
+         dtype: torch.dtype = torch.bfloat16,
      ):
          super().__init__()
          self._config = config
          self._device = device
+         if config.model_path == "THUDM/CogVideoX-2b":
+             dtype = torch.float16
          self._dtype = dtype

          if transformer is None:
@@ -99,8 +146,6 @@ class CogVideoXPipeline(VideoSysPipeline):
              )
          if vae is None:
              vae = AutoencoderKLCogVideoX.from_pretrained(config.model_path, subfolder="vae", torch_dtype=self._dtype)
-         if config.vae_tiling:
-             vae.enable_tiling(tile_sample_min_height=vae.tile_sample_min_height // 2)
          if tokenizer is None:
              tokenizer = T5Tokenizer.from_pretrained(config.model_path, subfolder="tokenizer")
          if text_encoder is None:
@@ -120,6 +165,14 @@ class CogVideoXPipeline(VideoSysPipeline):
              tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
          )

+         # cpu offload
+         if config.cpu_offload:
+             self.enable_model_cpu_offload()
+
+         # vae tiling
+         if config.vae_tiling:
+             vae.enable_tiling()
+
          # pab
          if config.enable_pab:
              set_pab_manager(config.pab_config)
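The pipeline default dtype is now bfloat16, with a fallback to float16 when the 2b checkpoint is selected (the 2b weights are typically run in fp16). A minimal sketch of that selection in isolation; the helper name is illustrative only:

```python
import torch


def select_dtype(model_path: str, requested: torch.dtype = torch.bfloat16) -> torch.dtype:
    # CogVideoX-2b falls back to float16; other checkpoints keep the requested (bfloat16) dtype
    if model_path == "THUDM/CogVideoX-2b":
        return torch.float16
    return requested


assert select_dtype("THUDM/CogVideoX-2b") is torch.float16
assert select_dtype("THUDM/CogVideoX-5b") is torch.bfloat16
```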
videosys/pipelines/latte/pipeline_latte.py CHANGED
@@ -79,10 +79,59 @@ class LattePABConfig(PABConfig):


  class LatteConfig:
+     """
+     This config is to instantiate a `LattePipeline` class for video generation.
+
+     To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+     In the engine, it will be used to instantiate the corresponding pipeline class.
+     And the engine will call the `generate` function of the pipeline to generate the video.
+     If you want to explore the detail of generation, please refer to the pipeline class below.
+
+     Args:
+         model_path (str):
+             A path to the pretrained pipeline. Defaults to "maxin-cn/Latte-1".
+         num_gpus (int):
+             The number of GPUs to use. Defaults to 1.
+         enable_vae_temporal_decoder (bool):
+             Whether to enable the VAE temporal decoder. Defaults to True.
+         beta_start (float):
+             The initial value of beta for DDIM. Defaults to 0.0001.
+         beta_end (float):
+             The final value of beta for DDIM. Defaults to 0.02.
+         beta_schedule (str):
+             The schedule of beta for DDIM. Defaults to "linear".
+         variance_type (str):
+             The type of variance for DDIM. Defaults to "learned_range".
+         enable_pab (bool):
+             Whether to enable Pyramid Attention Broadcast. Defaults to False.
+         pab_config (LattePABConfig):
+             The configuration for Pyramid Attention Broadcast. Defaults to `LattePABConfig()`.
+
+     Examples:
+         ```python
+         from videosys import LatteConfig, VideoSysEngine
+
+         # change num_gpus for multi-gpu inference
+         config = LatteConfig("maxin-cn/Latte-1", num_gpus=1)
+         engine = VideoSysEngine(config)
+
+         prompt = "Sunset over the sea."
+         # video size is fixed to 16 frames, 512x512.
+         video = engine.generate(
+             prompt=prompt,
+             guidance_scale=7.5,
+             num_inference_steps=50,
+         ).video[0]
+         engine.save_video(video, f"./outputs/{prompt}.mp4")
+         ```
+     """
+
      def __init__(
          self,
          model_path: str = "maxin-cn/Latte-1",
-         world_size: int = 1,
+         # ======= distributed =======
+         num_gpus: int = 1,
+         # ======= vae ========
          enable_vae_temporal_decoder: bool = True,
          # ======= scheduler ========
          beta_start: float = 0.0001,
@@ -93,22 +142,17 @@ class LatteConfig:
          enable_pab: bool = False,
          pab_config: PABConfig = LattePABConfig(),
      ):
-         # ======= engine ========
-         self.world_size = world_size
-
-         # ======= pipeline ========
-         self.pipeline_cls = LattePipeline
-
-         # ======= model ========
          self.model_path = model_path
+         self.pipeline_cls = LattePipeline
+         # ======= distributed =======
+         self.num_gpus = num_gpus
+         # ======= vae ========
          self.enable_vae_temporal_decoder = enable_vae_temporal_decoder
-
          # ======= scheduler ========
          self.beta_start = beta_start
          self.beta_end = beta_end
          self.beta_schedule = beta_schedule
          self.variance_type = variance_type
-
          # ======= pab ========
          self.enable_pab = enable_pab
          self.pab_config = pab_config
videosys/pipelines/open_sora/pipeline_open_sora.py CHANGED
@@ -69,38 +69,91 @@ class OpenSoraPABConfig(PABConfig):


  class OpenSoraConfig:
+     """
+     This config is to instantiate an `OpenSoraPipeline` class for video generation.
+
+     To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+     In the engine, it will be used to instantiate the corresponding pipeline class.
+     And the engine will call the `generate` function of the pipeline to generate the video.
+     If you want to explore the detail of generation, please refer to the pipeline class below.
+
+     Args:
+         transformer (str):
+             The transformer model to use. Defaults to "hpcai-tech/OpenSora-STDiT-v3".
+         vae (str):
+             The VAE model to use. Defaults to "hpcai-tech/OpenSora-VAE-v1.2".
+         text_encoder (str):
+             The text encoder model to use. Defaults to "DeepFloyd/t5-v1_1-xxl".
+         num_gpus (int):
+             The number of GPUs to use. Defaults to 1.
+         num_sampling_steps (int):
+             The number of sampling steps. Defaults to 30.
+         cfg_scale (float):
+             The classifier-free guidance scale. Defaults to 7.0.
+         tiling_size (int):
+             The tiling size. Defaults to 4.
+         enable_flash_attn (bool):
+             Whether to enable Flash Attention. Defaults to False.
+         enable_pab (bool):
+             Whether to enable Pyramid Attention Broadcast. Defaults to False.
+         pab_config (OpenSoraPABConfig):
+             The configuration for Pyramid Attention Broadcast. Defaults to `OpenSoraPABConfig()`.
+
+     Examples:
+         ```python
+         from videosys import OpenSoraConfig, VideoSysEngine
+
+         # change num_gpus for multi-gpu inference
+         # sampling parameters are defined in the config
+         config = OpenSoraConfig(num_sampling_steps=30, cfg_scale=7.0, num_gpus=1)
+         engine = VideoSysEngine(config)
+
+         prompt = "Sunset over the sea."
+         # num frames: 2s, 4s, 8s, 16s
+         # resolution: 144p, 240p, 360p, 480p, 720p
+         # aspect ratio: 9:16, 16:9, 3:4, 4:3, 1:1
+         video = engine.generate(
+             prompt=prompt,
+             resolution="480p",
+             aspect_ratio="9:16",
+             num_frames="2s",
+         ).video[0]
+         engine.save_video(video, f"./outputs/{prompt}.mp4")
+         ```
+     """
+
      def __init__(
          self,
-         model_path: str = "hpcai-tech/OpenSora-STDiT-v3",
-         world_size: int = 1,
+         transformer: str = "hpcai-tech/OpenSora-STDiT-v3",
          vae: str = "hpcai-tech/OpenSora-VAE-v1.2",
          text_encoder: str = "DeepFloyd/t5-v1_1-xxl",
-         # ======= scheduler =======
+         # ======== distributed ========
+         num_gpus: int = 1,
+         # ======== scheduler ========
          num_sampling_steps: int = 30,
          cfg_scale: float = 7.0,
-         # ======= vae ========
+         # ======== vae ========
          tiling_size: int = 4,
-         # ======= pab ========
+         # ======== speedup ========
+         enable_flash_attn: bool = False,
+         # ======== pab ========
          enable_pab: bool = False,
          pab_config: PABConfig = OpenSoraPABConfig(),
      ):
-         # ======= engine ========
-         self.world_size = world_size
-
-         # ======= pipeline ========
          self.pipeline_cls = OpenSoraPipeline
-         self.transformer = model_path
+         self.transformer = transformer
          self.vae = vae
          self.text_encoder = text_encoder
-
-         # ======= scheduler ========
+         # ======== distributed ========
+         self.num_gpus = num_gpus
+         # ======== scheduler ========
          self.num_sampling_steps = num_sampling_steps
          self.cfg_scale = cfg_scale
-
-         # ======= vae ========
+         # ======== vae ========
          self.tiling_size = tiling_size
-
-         # ======= pab ========
+         # ======== speedup ========
+         self.enable_flash_attn = enable_flash_attn
+         # ======== pab ========
          self.enable_pab = enable_pab
          self.pab_config = pab_config

@@ -157,16 +210,15 @@ class OpenSoraPipeline(VideoSysPipeline):
              tokenizer = AutoTokenizer.from_pretrained(config.text_encoder)
          if vae is None:
              vae = OpenSoraVAE_V1_2(
-                 from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+                 from_pretrained=config.vae,
                  micro_frame_size=17,
                  micro_batch_size=config.tiling_size,
              ).to(dtype)
          if transformer is None:
              transformer = STDiT3_XL_2(
-                 from_pretrained="hpcai-tech/OpenSora-STDiT-v3",
+                 from_pretrained=config.transformer,
                  qk_norm=True,
-                 enable_flash_attn=True,
-                 enable_layernorm_kernel=True,
+                 enable_flash_attn=config.enable_flash_attn,
                  in_channels=vae.out_channels,
                  caption_channels=text_encoder.config.d_model,
                  model_max_length=300,
videosys/pipelines/open_sora_plan/pipeline_open_sora_plan.py CHANGED
@@ -114,13 +114,61 @@ class OpenSoraPlanPABConfig(PABConfig):


  class OpenSoraPlanConfig:
+     """
+     This config is to instantiate an `OpenSoraPlanPipeline` class for video generation.
+
+     To be specific, this config will be passed to engine by `VideoSysEngine(config)`.
+     In the engine, it will be used to instantiate the corresponding pipeline class.
+     And the engine will call the `generate` function of the pipeline to generate the video.
+     If you want to explore the detail of generation, please refer to the pipeline class below.
+
+     Args:
+         transformer (str):
+             The transformer model to use. Defaults to "LanguageBind/Open-Sora-Plan-v1.1.0".
+         ae (str):
+             The Autoencoder model to use. Defaults to "CausalVAEModel_4x8x8".
+         text_encoder (str):
+             The text encoder model to use. Defaults to "DeepFloyd/t5-v1_1-xxl".
+         num_frames (int):
+             The number of frames to generate. Must be one of [65, 221].
+         num_gpus (int):
+             The number of GPUs to use. Defaults to 1.
+         enable_tiling (bool):
+             Whether to enable tiling. Defaults to True.
+         tile_overlap_factor (float):
+             The overlap factor for tiling. Defaults to 0.25.
+         enable_pab (bool):
+             Whether to enable Pyramid Attention Broadcast. Defaults to False.
+         pab_config (OpenSoraPlanPABConfig):
+             The configuration for Pyramid Attention Broadcast. Defaults to `OpenSoraPlanPABConfig()`.
+
+     Examples:
+         ```python
+         from videosys import OpenSoraPlanConfig, VideoSysEngine
+
+         # num frames: 65 or 221
+         # change num_gpus for multi-gpu inference
+         config = OpenSoraPlanConfig(num_frames=65, num_gpus=1)
+         engine = VideoSysEngine(config)
+
+         prompt = "Sunset over the sea."
+         video = engine.generate(
+             prompt=prompt,
+             guidance_scale=7.5,
+             num_inference_steps=150,
+         ).video[0]
+         engine.save_video(video, f"./outputs/{prompt}.mp4")
+         ```
+     """
+
      def __init__(
          self,
-         model_path: str = "LanguageBind/Open-Sora-Plan-v1.1.0",
-         world_size: int = 1,
-         num_frames: int = 65,
+         transformer: str = "LanguageBind/Open-Sora-Plan-v1.1.0",
          ae: str = "CausalVAEModel_4x8x8",
          text_encoder: str = "DeepFloyd/t5-v1_1-xxl",
+         num_frames: int = 65,
+         # ======= distributed ========
+         num_gpus: int = 1,
          # ======= vae =======
          enable_tiling: bool = True,
          tile_overlap_factor: float = 0.25,
@@ -128,24 +176,18 @@ class OpenSoraPlanConfig:
          enable_pab: bool = False,
          pab_config: PABConfig = OpenSoraPlanPABConfig(),
      ):
-         # ======= engine ========
-         self.world_size = world_size
-
-         # ======= pipeline ========
          self.pipeline_cls = OpenSoraPlanPipeline
          self.ae = ae
          self.text_encoder = text_encoder
-
-         # ======= model ========
-         self.model_path = model_path
+         self.transformer = transformer
          assert num_frames in [65, 221], "num_frames must be one of [65, 221]"
          self.num_frames = num_frames
          self.version = f"{num_frames}x512x512"
-
+         # ======= distributed ========
+         self.num_gpus = num_gpus
          # ======= vae ========
          self.enable_tiling = enable_tiling
          self.tile_overlap_factor = tile_overlap_factor
-
          # ======= pab ========
          self.enable_pab = enable_pab
          self.pab_config = pab_config
@@ -200,9 +242,9 @@ class OpenSoraPlanPipeline(VideoSysPipeline):
          if text_encoder is None:
              text_encoder = T5EncoderModel.from_pretrained(config.text_encoder, torch_dtype=torch.float16)
          if vae is None:
-             vae = getae_wrapper(config.ae)(config.model_path, subfolder="vae").to(dtype=dtype)
+             vae = getae_wrapper(config.ae)(config.transformer, subfolder="vae").to(dtype=dtype)
          if transformer is None:
-             transformer = LatteT2V.from_pretrained(config.model_path, subfolder=config.version, torch_dtype=dtype)
+             transformer = LatteT2V.from_pretrained(config.transformer, subfolder=config.version, torch_dtype=dtype)
          if scheduler is None:
              scheduler = PNDMScheduler()

videosys/utils/utils.py CHANGED
@@ -76,7 +76,7 @@ def save_video(video, output_path, fps):
      """
      Save a video to disk.
      """
+     if dist.is_initialized() and dist.get_rank() != 0:
+         return
      os.makedirs(os.path.dirname(output_path), exist_ok=True)
-     if dist.get_rank() == 0:
-         imageio.mimwrite(output_path, video, fps=fps)
-     dist.barrier()
+     imageio.mimwrite(output_path, video, fps=fps)
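The rewritten helper in full, with its imports, as a reference sketch: the early return replaces the old rank-0 `if`/`barrier` pair, so single-process runs (where `torch.distributed` is never initialized) fall straight through to the write.

```python
import os

import imageio
import torch.distributed as dist


def save_video(video, output_path, fps):
    """Save a video to disk; under torch.distributed only rank 0 writes."""
    if dist.is_initialized() and dist.get_rank() != 0:
        return
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    imageio.mimwrite(output_path, video, fps=fps)
```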