...
| Module | Class | Device | Dtype | Quant | Params | Modules | Config |
|---|---|---|---|---|---|---|---|
| vae | AutoencoderKL | xpu:0 | torch.bfloat16 | None | 83653863 | 243 | FrozenDict({'in_channels': 3, 'out_channels': 3, 'down_block_types': ['DownEncoderBlock2D', 'DownEncoderBlock2D', 'DownEncoderBlock2D', 'DownEncoderBlock2D'], 'up_block_types': ['UpDecoderBlock2D', 'UpDecoderBlock2D', 'UpDecoderBlock2D', 'UpDecoderBlock2D'], 'block_out_channels': [128, 256, 512, 512], 'layers_per_block': 2, 'act_fn': 'silu', 'latent_channels': 4, 'norm_num_groups': 32, 'sample_size': 1024, 'scaling_factor': 0.13025, 'shift_factor': None, 'latents_mean': None, 'latents_std': None, 'force_upcast': False, 'use_quant_conv': True, 'use_post_quant_conv': True, 'mid_block_add_attention': True, '_use_default_values': ['use_post_quant_conv', 'latents_std', 'use_quant_conv', 'shift_factor', 'mid_block_add_attention', 'latents_mean'], '_class_name': 'AutoencoderKL', '_diffusers_version': '0.20.0.dev0', '_name_or_path': '../sdxl-vae/'}) |
| text_encoder | CLIPTextModel | xpu:0 | torch.bfloat16 | None | 123060480 | 152 | CLIPTextConfig { "architectures": [ "CLIPTextModel" ], "attention_dropout": 0.0, "bos_token_id": 0, "dropout": 0.0, "dtype": "float16", "eos_token_id": 2, "hidden_act": "quick_gelu", "hidden_size": 768, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-05, "max_position_embeddings": 77, "model_type": "clip_text_model", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "projection_dim": 768, "transformers_version": "4.56.2", "vocab_size": 49408 } |
| text_encoder_2 | CLIPTextModelWithProjection | xpu:0 | torch.bfloat16 | None | 694659840 | 393 | CLIPTextConfig { "architectures": [ "CLIPTextModelWithProjection" ], "attention_dropout": 0.0, "bos_token_id": 0, "dropout": 0.0, "dtype": "float16", "eos_token_id": 2, "hidden_act": "gelu", "hidden_size": 1280, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 5120, "layer_norm_eps": 1e-05, "max_position_embeddings": 77, "model_type": "clip_text_model", "num_attention_heads": 20, "num_hidden_layers": 32, "pad_token_id": 1, "projection_dim": 1280, "transformers_version": "4.56.2", "vocab_size": 49408 } |
| tokenizer | CLIPTokenizer | None | None | None | 0 | 0 | None |
| tokenizer_2 | CLIPTokenizer | None | None | None | 0 | 0 | None |
| unet | UNet2DConditionModel | xpu:0 | torch.bfloat16 | None | 2567463684 | 1930 | FrozenDict({'sample_size': 128, 'in_channels': 4, 'out_channels': 4, 'center_input_sample': False, 'flip_sin_to_cos': True, 'freq_shift': 0, 'down_block_types': ['DownBlock2D', 'CrossAttnDownBlock2D', 'CrossAttnDownBlock2D'], 'mid_block_type': 'UNetMidBlock2DCrossAttn', 'up_block_types': ['CrossAttnUpBlock2D', 'CrossAttnUpBlock2D', 'UpBlock2D'], 'only_cross_attention': False, 'block_out_channels': [320, 640, 1280], 'layers_per_block': 2, 'downsample_padding': 1, 'mid_block_scale_factor': 1, 'dropout': 0.0, 'act_fn': 'silu', 'norm_num_groups': 32, 'norm_eps': 1e-05, 'cross_attention_dim': 2048, 'transformer_layers_per_block': [1, 2, 10], 'reverse_transformer_layers_per_block': None, 'encoder_hid_dim': None, 'encoder_hid_dim_type': None, 'attention_head_dim': [5, 10, 20], 'num_attention_heads': None, 'dual_cross_attention': False, 'use_linear_projection': True, 'class_embed_type': None, 'addition_embed_type': 'text_time', 'addition_time_embed_dim': 256, 'num_class_embeds': None, 'upcast_attention': None, 'resnet_time_scale_shift': 'default', 'resnet_skip_time_act': False, 'resnet_out_scale_factor': 1.0, 'time_embedding_type': 'positional', 'time_embedding_dim': None, 'time_embedding_act_fn': None, 'timestep_post_act': None, 'time_cond_proj_dim': None, 'conv_in_kernel': 3, 'conv_out_kernel': 3, 'projection_class_embeddings_input_dim': 2816, 'attention_type': 'default', 'class_embeddings_concat': False, 'mid_block_only_cross_attention': None, 'cross_attention_norm': None, 'addition_embed_type_num_heads': 64, '_use_default_values': ['attention_type', 'dropout', 'reverse_transformer_layers_per_block'], '_class_name': 'UNet2DConditionModel', '_diffusers_version': '0.19.0.dev0'}) |
| scheduler | EulerDiscreteScheduler | None | None | None | 0 | 0 | FrozenDict({'num_train_timesteps': 1000, 'beta_start': 0.00085, 'beta_end': 0.012, 'beta_schedule': 'scaled_linear', 'trained_betas': None, 'prediction_type': 'epsilon', 'interpolation_type': 'linear', 'use_karras_sigmas': False, 'use_exponential_sigmas': False, 'use_beta_sigmas': False, 'sigma_min': None, 'sigma_max': None, 'timestep_spacing': 'leading', 'timestep_type': 'discrete', 'steps_offset': 1, 'rescale_betas_zero_snr': False, 'final_sigmas_type': 'zero', '_use_default_values': ['final_sigmas_type', 'use_beta_sigmas', 'use_exponential_sigmas', 'sigma_min', 'rescale_betas_zero_snr', 'timestep_type', 'sigma_max'], '_class_name': 'EulerDiscreteScheduler', '_diffusers_version': '0.19.0.dev0', 'clip_sample': False, 'sample_max_value': 1.0, 'set_alpha_to_one': False, 'skip_prk_steps': True}) |
| image_encoder | NoneType | None | None | None | 0 | 0 | None |
| feature_extractor | NoneType | None | None | None | 0 | 0 | None |
| force_zeros_for_empty_prompt | bool | None | None | None | 0 | 0 | None |