Upload modeling_florence2.py
Florence2/modeling_florence2.py
CHANGED
@@ -29,6 +29,7 @@ from einops import rearrange
 from timm.layers import DropPath, trunc_normal_
 
 from transformers.modeling_utils import PreTrainedModel
+from transformers.generation import GenerationMixin
 from transformers.utils import (
     ModelOutput,
     add_start_docstrings,
@@ -1430,7 +1431,7 @@ class Florence2DecoderLayer(nn.Module):
 
 
 
-class Florence2LanguagePreTrainedModel(PreTrainedModel):
+class Florence2LanguagePreTrainedModel(PreTrainedModel, GenerationMixin):
     config_class = Florence2LanguageConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
@@ -1462,7 +1463,7 @@ class Florence2LanguagePreTrainedModel(PreTrainedModel):
         return dummy_inputs
 
 
-class Florence2Encoder(Florence2LanguagePreTrainedModel):
+class Florence2Encoder(Florence2LanguagePreTrainedModel, GenerationMixin):
     """
     Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
     [`Florence2EncoderLayer`].
@@ -1650,7 +1651,7 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
         )
 
 
-class Florence2Decoder(Florence2LanguagePreTrainedModel):
+class Florence2Decoder(Florence2LanguagePreTrainedModel, GenerationMixin):
     """
     Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Florence2DecoderLayer`]
 
@@ -1937,7 +1938,7 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
         )
 
 
-class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
+class Florence2LanguageModel(Florence2LanguagePreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
 
     def __init__(self, config: Florence2LanguageConfig):
@@ -2058,7 +2059,6 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
             encoder_attentions=encoder_outputs.attentions,
         )
 
-from transformers.generation.utils import GenerationMixin
 
 class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel, GenerationMixin):
     base_model_prefix = "model"
@@ -2327,7 +2327,7 @@ FLORENCE2_START_DOCSTRING = r"""
     "The bare Florence-2 Model outputting raw hidden-states without any specific head on top.",
     FLORENCE2_START_DOCSTRING,
 )
-class Florence2PreTrainedModel(PreTrainedModel):
+class Florence2PreTrainedModel(PreTrainedModel, GenerationMixin):
    config_class = Florence2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
@@ -2530,7 +2530,7 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
     """The FLORENCE2 model which consists of a vision backbone and a language model.""",
     FLORENCE2_START_DOCSTRING,
 )
-class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
+class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixin):
     def __init__(self, config: Florence2Config):
         super().__init__(config)
         assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
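
Taken together, the hunks make one change: every Florence-2 model class now inherits `GenerationMixin` explicitly, and the mixin's import moves from the middle of the file (the old `transformers.generation.utils` path, deleted in the `@@ -2058` hunk) to the module header (`transformers.generation`, added in the `@@ -29` hunk). This matters because from transformers v4.50 onwards `PreTrainedModel` no longer inherits from `GenerationMixin`, so custom-code models relying on the old implicit inheritance lose `.generate()`. Strictly, only the two `*ForConditionalGeneration` heads call into the mixin; adding it to the encoder, decoder, and base classes as well is redundant but harmless.

Below is a minimal sketch of exercising the patched file. The checkpoint id, image URL, and `<OD>` task prompt are illustrative placeholders following the usual Florence-2 usage pattern, not part of this diff:

import requests
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Placeholder checkpoint id: any Florence-2 repo that ships this
# modeling_florence2.py via trust_remote_code behaves the same way.
model_id = "microsoft/Florence-2-base"

model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text="<OD>", images=image, return_tensors="pt")

# .generate() is supplied by GenerationMixin; on transformers >= 4.50 the
# explicit inheritance added by this commit is what keeps it available.
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=128,
)
print(processor.batch_decode(generated_ids, skip_special_tokens=False)[0])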
|