DeepBeepMeep committed on
Commit
8646b50
·
verified ·
1 Parent(s): c5bb338

Upload modeling_florence2.py

Browse files
Files changed (1) hide show
  1. Florence2/modeling_florence2.py +7 -7
Florence2/modeling_florence2.py CHANGED
@@ -29,6 +29,7 @@ from einops import rearrange
29
  from timm.layers import DropPath, trunc_normal_
30
 
31
  from transformers.modeling_utils import PreTrainedModel
 
32
  from transformers.utils import (
33
  ModelOutput,
34
  add_start_docstrings,
@@ -1430,7 +1431,7 @@ class Florence2DecoderLayer(nn.Module):
1430
 
1431
 
1432
 
1433
- class Florence2LanguagePreTrainedModel(PreTrainedModel):
1434
  config_class = Florence2LanguageConfig
1435
  base_model_prefix = "model"
1436
  supports_gradient_checkpointing = True
@@ -1462,7 +1463,7 @@ class Florence2LanguagePreTrainedModel(PreTrainedModel):
1462
  return dummy_inputs
1463
 
1464
 
1465
- class Florence2Encoder(Florence2LanguagePreTrainedModel):
1466
  """
1467
  Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
1468
  [`Florence2EncoderLayer`].
@@ -1650,7 +1651,7 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1650
  )
1651
 
1652
 
1653
- class Florence2Decoder(Florence2LanguagePreTrainedModel):
1654
  """
1655
  Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Florence2DecoderLayer`]
1656
 
@@ -1937,7 +1938,7 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1937
  )
1938
 
1939
 
1940
- class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
1941
  _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
1942
 
1943
  def __init__(self, config: Florence2LanguageConfig):
@@ -2058,7 +2059,6 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
2058
  encoder_attentions=encoder_outputs.attentions,
2059
  )
2060
 
2061
- from transformers.generation.utils import GenerationMixin
2062
 
2063
  class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel, GenerationMixin):
2064
  base_model_prefix = "model"
@@ -2327,7 +2327,7 @@ FLORENCE2_START_DOCSTRING = r"""
2327
  "The bare Florence-2 Model outputting raw hidden-states without any specific head on top.",
2328
  FLORENCE2_START_DOCSTRING,
2329
  )
2330
- class Florence2PreTrainedModel(PreTrainedModel):
2331
  config_class = Florence2Config
2332
  base_model_prefix = "model"
2333
  supports_gradient_checkpointing = True
@@ -2530,7 +2530,7 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2530
  """The FLORENCE2 model which consists of a vision backbone and a language model.""",
2531
  FLORENCE2_START_DOCSTRING,
2532
  )
2533
- class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2534
  def __init__(self, config: Florence2Config):
2535
  super().__init__(config)
2536
  assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
 
29
  from timm.layers import DropPath, trunc_normal_
30
 
31
  from transformers.modeling_utils import PreTrainedModel
32
+ from transformers.generation import GenerationMixin
33
  from transformers.utils import (
34
  ModelOutput,
35
  add_start_docstrings,
 
1431
 
1432
 
1433
 
1434
+ class Florence2LanguagePreTrainedModel(PreTrainedModel, GenerationMixin ):
1435
  config_class = Florence2LanguageConfig
1436
  base_model_prefix = "model"
1437
  supports_gradient_checkpointing = True
 
1463
  return dummy_inputs
1464
 
1465
 
1466
+ class Florence2Encoder(Florence2LanguagePreTrainedModel, GenerationMixin):
1467
  """
1468
  Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
1469
  [`Florence2EncoderLayer`].
 
1651
  )
1652
 
1653
 
1654
+ class Florence2Decoder(Florence2LanguagePreTrainedModel, GenerationMixin):
1655
  """
1656
  Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Florence2DecoderLayer`]
1657
 
 
1938
  )
1939
 
1940
 
1941
+ class Florence2LanguageModel(Florence2LanguagePreTrainedModel, GenerationMixin):
1942
  _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
1943
 
1944
  def __init__(self, config: Florence2LanguageConfig):
 
2059
  encoder_attentions=encoder_outputs.attentions,
2060
  )
2061
 
 
2062
 
2063
  class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel, GenerationMixin):
2064
  base_model_prefix = "model"
 
2327
  "The bare Florence-2 Model outputting raw hidden-states without any specific head on top.",
2328
  FLORENCE2_START_DOCSTRING,
2329
  )
2330
+ class Florence2PreTrainedModel(PreTrainedModel, GenerationMixin):
2331
  config_class = Florence2Config
2332
  base_model_prefix = "model"
2333
  supports_gradient_checkpointing = True
 
2530
  """The FLORENCE2 model which consists of a vision backbone and a language model.""",
2531
  FLORENCE2_START_DOCSTRING,
2532
  )
2533
+ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixin):
2534
  def __init__(self, config: Florence2Config):
2535
  super().__init__(config)
2536
  assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'