replit-code-v1-3b-demo

Paused

App Files Files Community

Canstralian committed on Jan 27

Commit

88a170b

verified ·

1 Parent(s): d1e65a6

Upload 18 files

Browse files

Files changed (18) hide show

.dockerignore +3 -0
LICENSE.txt +201 -0
README.md +3 -34
cog-replit-code-v1-3b-main/.dockerignore +3 -0
cog-replit-code-v1-3b-main/LICENSE.txt +201 -0
cog-replit-code-v1-3b-main/README.md +5 -0
cog-replit-code-v1-3b-main/cog.yaml +15 -0
cog-replit-code-v1-3b-main/predict.py +202 -0
cog-replit-code-v1-3b-main/requirements.txt +6 -0
cog-replit-code-v1-3b-main/scripts/download_and_prepare_model.py +107 -0
cog-replit-code-v1-3b-main/scripts/tensorize_model.py +91 -0
cog-replit-code-v1-3b-main/subclass.py +284 -0
cog.yaml +15 -0
predict.py +202 -0
requirements.txt +6 -4
scripts/download_and_prepare_model.py +107 -0
scripts/tensorize_model.py +91 -0
subclass.py +284 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,3 @@

+model/*.bin
+model/*.tensors
+notebooks

LICENSE.txt ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2022, Replicate, Inc.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,36 +1,5 @@
----
-sdk: streamlit
-sdk_version: 1.41.1
-license: mit
-title: replit-code-v1-3b-fine-tuned
-emoji: 📚
-colorFrom: yellow
-colorTo: blue
----
-# Replit Code V1 3B Fine-Tuned Model
-This model is a fine-tuned version of the Replit Code model, designed to assist with generating Python code from pseudocode and offering AI-driven suggestions for code optimization. It helps streamline machine learning workflows and automates coding tasks with the power of AI.
-## Features:
-- **Text Generation:** Generate human-like code based on descriptions.
-- **Pseudocode to Python:** Convert pseudocode into optimized Python code.
-- **Code Optimization:** Provide suggestions for optimizing Python code.
-- **ML Debugging:** Analyze and provide feedback for machine learning pipeline errors.
-## License:
-This model is licensed under the MIT License. Feel free to use and adapt it according to the terms of the license.
-## Tags:
-`machine learning`, `code generation`, `python`, `AI`, `code optimization`, `streamlit`, `transformers`
-## Model Details:
-- **Base Model:** Replit Code (fine-tuned)
-- **Purpose:** AI assistant for improving Python code and machine learning pipelines.
-## Usage:
-Interact with this model through the provided interface in Streamlit. Input pseudocode or Python code, and the model will assist with text generation, optimization, or debugging.
----
-Powered by [Replit LLM](https://replit.com) and [Hugging Face](https://huggingface.co).

+# replit-code-v1-3b
+[![Replicate](https://replicate.com/replicate/replit-code-v1-3b/badge)](https://replicate.com/replicate/replit-code-v1-3b)
+A [Cog](https://cog.run) implementation of Replit's [replit-code-v1-3b](https://huggingface.co/replit/replit-code-v1-3b) Large Language Model

cog-replit-code-v1-3b-main/.dockerignore ADDED Viewed

	@@ -0,0 +1,3 @@

+model/*.bin
+model/*.tensors
+notebooks

cog-replit-code-v1-3b-main/LICENSE.txt ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2022, Replicate, Inc.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

cog-replit-code-v1-3b-main/README.md ADDED Viewed

	@@ -0,0 +1,5 @@

+# replit-code-v1-3b
+[![Replicate](https://replicate.com/replicate/replit-code-v1-3b/badge)](https://replicate.com/replicate/replit-code-v1-3b)
+A [Cog](https://cog.run) implementation of Replit's [replit-code-v1-3b](https://huggingface.co/replit/replit-code-v1-3b) Large Language Model

cog-replit-code-v1-3b-main/cog.yaml ADDED Viewed

	@@ -0,0 +1,15 @@

+build:
+  gpu: true
+  cuda: "11.7"
+  python_version: "3.10"
+  python_requirements: requirements.txt
+  # commands run after the environment is setup
+  run:
+    - pip install flash-attn==0.2.8
+    - pip install triton==2.0.0.dev20221202
+    - pip install tensorizer==1.1.0
+    - echo 'deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main' | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
+    - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
+    - apt-get update && apt-get install google-cloud-cli
+predict: "predict.py:Predictor"

cog-replit-code-v1-3b-main/predict.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import time
+from typing import Optional
+import subprocess
+import torch
+import os
+from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
+from tensorizer import TensorDeserializer
+from tensorizer.utils import no_init_or_tensor
+from collections import OrderedDict
+from cog import BasePredictor, ConcatenateIterator, Input, Path
+# from config import DEFAULT_MODEL_NAME, DEFAULT_CONFIG_PATH, load_tokenizer, load_tensorizer
+from subclass import YieldingReplitCode
+# Weights live either on local disk or in a cloud bucket.
+# For development, point TENSORIZER_WEIGHTS_PATH at a local path on disk.
+# This is the path weights are pulled from when no COG_WEIGHTS environment variable is set (COG_WEIGHTS is used for trainable models).
+# TENSORIZER_WEIGHTS_PATH = "model/model.tensors"
+TENSORIZER_WEIGHTS_PATH = "gs://replicate-weights/replit-code-v1-3b/model.tensors"
+# Set this to a GCP URL when pushing the model
+# TENSORIZER_WEIGHTS_PATH = None
+DEFAULT_CONFIG_PATH = "model/"
+TOKENIZER_PATH = "model/"
+def maybe_download(path):
+    if path.startswith("gs://"):
+        st = time.time()
+        output_path = "/tmp/weights.tensors"
+        subprocess.check_call(["gcloud", "storage", "cp", path, output_path])
+        print(f"weights downloaded in {time.time() - st}")
+        return output_path
+    return path
+class Predictor(BasePredictor):
+    def setup(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # set TOKENIZERS_PARALLELISM to false to avoid a warning
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        self.model = self.load_tensorizer(
+            weights=maybe_download(TENSORIZER_WEIGHTS_PATH), plaid_mode=True, cls=YieldingReplitCode, config_path=DEFAULT_CONFIG_PATH,
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
+    def load_tensorizer(self, weights, plaid_mode, cls, config_path):
+        st = time.time()
+        print(f"deserializing weights from {weights}")
+        config = AutoConfig.from_pretrained(config_path, trust_remote_code=True)
+        config.attn_config['attn_impl'] = 'triton'
+        # with no_init_or_tensor():
+        #     model = YieldingReplitCode.from_pretrained('./model/', config=config, trust_remote_code=True)
+        model = no_init_or_tensor(
+            lambda: cls.from_pretrained(
+                None, config=config, state_dict=OrderedDict(), trust_remote_code=True,
+            )
+        )
+        deserialized = TensorDeserializer(weights, plaid_mode=True)
+        deserialized.load_into_module(model)
+        try:
+          model = model.to(dtype=torch.bfloat16)
+        except:
+            pass
+        print(f"weights loaded in {time.time() - st}")
+        return model
+    def predict(
+        self,
+        prompt: str = Input(description=f"Text prompt"),
+        max_length: int = Input(
+            description="Maximum number of tokens to generate. A word is generally 2-3 tokens",
+            ge=1,
+            default=500,
+        ),
+        temperature: float = Input(
+            description="Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic, 0.75 is a good starting value.",
+            ge=0.01,
+            le=5,
+            default=0.75,
+        ),
+        top_p: float = Input(
+            description="When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens",
+            ge=0.01,
+            le=1.0,
+            default=1.0,
+        ),
+        repetition_penalty: float = Input(
+            description="Penalty for repeated words in generated text; 1 is no penalty, values greater than 1 discourage repetition, less than 1 encourage it.",
+            ge=0.01,
+            le=5,
+            default=1,
+        ),
+        length_penalty: float = Input(
+            description="Increasing the length_penalty parameter above 1.0 will cause the model to favor longer sequences, while decreasing it below 1.0 will cause the model to favor shorter sequences.",
+            ge=0.01,
+            le=5,
+            default=1,
+        ),
+        no_repeat_ngram_size: int = Input(
+            description="If set to int > 0, all ngrams of size no_repeat_ngram_size can only occur once.",
+            ge=0,
+            default=0,
+        ),
+        stop_sequence: str = Input(
+            description="Generation will hault if this token is produced. Currently, only single token stop sequences are support and it is recommended to use `###` as the stop sequence if you want to control generation termination.",
+            default=None,
+        ),
+        seed: int = Input(
+            description="Set seed for reproducible outputs. Set to -1 for random seed.",
+            ge=-1,
+            default=-1,
+        ),
+        debug: bool = Input(
+            description="provide debugging output in logs", default=False
+        ),
+    ) -> ConcatenateIterator[str]:
+        input = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
+        # set torch seed
+        if seed == -1:
+            torch.seed()
+        else:
+            torch.manual_seed(seed)
+            torch.cuda.manual_seed(seed)
+        with torch.inference_mode():
+            first_token_yielded = False
+            prev_ids = []
+            for output in self.model.generate(
+                input,
+                max_length=max_length,
+                do_sample=True,
+                temperature=temperature,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                length_penalty=length_penalty,
+                no_repeat_ngram_size=no_repeat_ngram_size,
+            ):
+                cur_id = output.item()
+                # in order to properly handle spaces, we need to do our own tokenizing. Fun!
+                # we're building up a buffer of sub-word / punctuation tokens until we hit a space, and then yielding whole words + punctuation.
+                cur_token = self.tokenizer.convert_ids_to_tokens(cur_id)
+                # skip initial newline, which this almost always yields. hack - newline id = 13.
+                if not first_token_yielded and not prev_ids and cur_id == 187:
+                    continue
+                # Ġ means a space, means we yield previous tokens
+                if cur_token.startswith("Ġ"):  # this is not a standard G.
+                    # first token
+                    if not prev_ids:
+                        prev_ids = [cur_id]
+                        continue
+                    # there are tokens to yield
+                    else:
+                        token = self.tokenizer.decode(prev_ids, clean_up_tokenization_spaces=False)
+                        prev_ids = [cur_id]
+                        if not first_token_yielded:
+                            # no leading space for first token
+                            token = token.strip()
+                            first_token_yielded = True
+                        yield token
+                                # End token
+                elif cur_token == "<|endoftext|>":
+                    break
+                elif stop_sequence and cur_token == stop_sequence:
+                    break
+                else:
+                    prev_ids.append(cur_id)
+                    continue
+            # remove any special tokens such as </s>
+            token = self.tokenizer.decode(prev_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+            if not first_token_yielded:
+                # no leading space for first token
+                token = token.strip()
+                first_token_yielded = True
+            yield token
+        if debug:
+            print(f"cur memory: {torch.cuda.memory_allocated()}")
+            print(f"max allocated: {torch.cuda.max_memory_allocated()}")
+            print(f"peak memory: {torch.cuda.max_memory_reserved()}")

cog-replit-code-v1-3b-main/requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+einops==0.6.1
+sentencepiece==0.1.99
+torch==2.0.1
+transformers==4.29.2
+# flash-attn==0.2.8
+# triton==2.0.0.dev20221202

cog-replit-code-v1-3b-main/scripts/download_and_prepare_model.py ADDED Viewed

	@@ -0,0 +1,107 @@

+#!/usr/bin/env python
+import os
+import shutil
+import argparse
+import logging
+import sys
+import torch
+from distutils.dir_util import copy_tree
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from huggingface_hub import snapshot_download, login
+from tensorizer import TensorSerializer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from tensorize_model import tensorize_model
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+def download_model_from_hf_hub(
+        model_name: str,
+        model_path: str,
+        rm_existing_model: bool = True,
+    ) -> dict:
+    """
+    Download a model snapshot from the Hugging Face Hub and copy it to `model_path`.
+    The snapshot is first downloaded into a temporary cache directory and then copied,
+    so the Hub cache layout does not end up inside `model_path`.
+    Args:
+        model_name (str): Name of model on hugging face hub
+        model_path (str): Local path where model is saved
+        rm_existing_model (bool, optional): Whether to remove an existing directory at
+            `model_path` before downloading. Defaults to True.
+    Returns:
+        dict: Dictionary containing the model name and path
+    """
+    # Only report (and perform) a removal when there is actually something to remove.
+    if rm_existing_model and os.path.exists(model_path):
+        logger.info(f"Removing existing model at {model_path}")
+        shutil.rmtree(model_path)
+    # setup temporary directory
+    with TemporaryDirectory() as tmpdir:
+        logger.info(f"Downloading {model_name} weights to temp...")
+        snapshot_dir = snapshot_download(
+            repo_id=model_name,
+            cache_dir=tmpdir,
+            allow_patterns=["*.bin", "*.json", "*.md", "*.model", "*.py"],
+        )
+        # copy snapshot to model dir; dirs_exist_ok lets the copy merge into a directory
+        # that was kept because rm_existing_model is False.
+        # shutil.copytree replaces the deprecated distutils.dir_util.copy_tree.
+        logger.info(f"Copying weights to {model_path}...")
+        shutil.copytree(snapshot_dir, str(model_path), dirs_exist_ok=True)
+    return {"model_name": model_name, "model_path": model_path}
+def download_hf_model_and_copy_tokenizer(
+        model_name: str,
+        model_path: str,
+        tokenizer_path: str,
+        rm_existing_model: bool = True,
+):
+    """
+    Download a model from the Hugging Face Hub and optionally copy its tokenizer and config.
+    Keeping the tokenizer and `config.json` in a separate directory lets them be shipped
+    without the model weights.
+    Args:
+        model_name (str): Name of model on hugging face hub
+        model_path (str): Local path where model is saved
+        tokenizer_path (str): Directory that receives the tokenizer files and `config.json`.
+            If falsy, the tokenizer/config copy is skipped.
+        rm_existing_model (bool, optional): Forwarded to `download_model_from_hf_hub`.
+            Defaults to True.
+    Returns:
+        dict: The dictionary returned by `download_model_from_hf_hub`.
+    """
+    # Forward rm_existing_model; previously it was accepted here but silently ignored.
+    model_info = download_model_from_hf_hub(
+        model_name, model_path, rm_existing_model=rm_existing_model
+    )
+    if tokenizer_path:
+        # Move tokenizer to separate location
+        logger.info(f"Copying tokenizer and model config to {tokenizer_path}...")
+        tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")
+        tokenizer.save_pretrained(tokenizer_path)
+        # Copy the model config next to the tokenizer files
+        config_path = os.path.join(model_path, "config.json")
+        shutil.copy(config_path, tokenizer_path)
+    return model_info
+# Command-line entry point: optionally log in to the Hugging Face Hub, then (with --tensorize)
+# tensorize the model found at --model_path.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name", type=str)
+    parser.add_argument("--model_path", type=str)
+    parser.add_argument("--tokenizer_path", type=str, default=None)
+    parser.add_argument("--hf_token", type=str, default=None)
+    parser.add_argument("--tensorize", action="store_true", default=False)
+    parser.add_argument("--dtype", type=str, default="fp32")
+    args = parser.parse_args()
+    # Only authenticate when a token was supplied
+    if args.hf_token is not None:
+        login(token=args.hf_token)
+    # NOTE: the download step below is currently disabled, so nothing is fetched by this script
+    # download_hf_model_and_copy_tokenizer(args.model_name, model_path=args.model_path, tokenizer_path=args.tokenizer_path)
+    # Tensorized weights are written next to the model files as model.tensors
+    tensorizer_path = os.path.join(args.model_path, "model.tensors")
+    if args.tensorize:
+        model = tensorize_model(args.model_name, model_path=args.model_path, dtype=args.dtype, tensorizer_path=tensorizer_path)

cog-replit-code-v1-3b-main/scripts/tensorize_model.py ADDED Viewed

	@@ -0,0 +1,91 @@

+#!/usr/bin/env python
+import torch
+import os
+import argparse
+import logging
+import sys
+from tensorizer import TensorSerializer
+from transformers import AutoModelForCausalLM, AutoConfig
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+def tensorize_model(
+        model_name: str,
+        model_path: str,
+        tensorizer_path: str,
+        dtype: str = "fp32",
+) -> dict:
+    """
+    Create a tensorized version of model weights, loaded in the requested dtype.
+    The model is loaded from `model_path`, moved to `cuda:0` and serialized to
+    `tensorizer_path`. The model config is then saved with `save_pretrained(model_name)`.
+    Args:
+        model_name (str): Name of model on hugging face hub. Also used as the directory
+            the model config is saved to.
+        model_path (str): Local path where model weights are saved.
+        tensorizer_path (str): Local path where tensorized model weights are written.
+        dtype (str): One of `"fp32"`, `"fp16"`, and `"bf16"`. Defaults to `"fp32"`;
+            `None` is treated as `"fp32"`.
+    Returns:
+        dict: Dictionary containing the tensorized model path and dtype.
+    Raises:
+        ValueError: If `dtype` is not one of the supported names.
+    """
+    dtype_map = {
+        "fp32": torch.float32,
+        "bf16": torch.bfloat16,
+        "fp16": torch.float16,
+    }
+    dtype_key = "fp32" if dtype is None else dtype
+    if dtype_key not in dtype_map:
+        raise ValueError(
+            f"Unsupported dtype {dtype!r}; expected one of {sorted(dtype_map)}"
+        )
+    torch_dtype = dtype_map[dtype_key]
+    logger.info(f"Loading {model_name} in {dtype} from {model_path}...")
+    # Pass torch_dtype so the requested precision is actually applied; previously it was
+    # computed but never used.
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path, trust_remote_code=True, torch_dtype=torch_dtype,
+    ).to('cuda:0')
+    logger.info(f"Tensorizing model {model_name} in {dtype} and writing tensors to {tensorizer_path}...")
+    serializer = TensorSerializer(tensorizer_path)
+    try:
+        serializer.write_module(model)
+    finally:
+        # Always release the output stream, even if serialization fails part-way.
+        serializer.close()
+    # NOTE(review): the config is saved under a directory named after `model_name`, not next
+    # to the tensorized weights - confirm this is the intended location.
+    model_config = model.config
+    model_config.save_pretrained(model_name)
+    # save_pretrained writes config.json inside the given directory; define the path that the
+    # log message reports (it was previously an undefined name).
+    config_path = os.path.join(model_name, "config.json")
+    logger.info(f"Tensorized model {model_name} in {dtype} and wrote tensors to {tensorizer_path} and config to {config_path}...")
+    return {"tensorized_weights_path": tensorizer_path, "dtype": dtype}
+# Command-line entry point: parse the model/tensorizer locations and dtype, then tensorize.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=(
+        "A simple script for tensorizing a torch model."
+        )
+    )
+    parser.add_argument("--model_name", type=str)
+    parser.add_argument("--model_path", type=str, default=None)
+    parser.add_argument("--tensorizer_path", type=str, default=None)
+    parser.add_argument("--dtype", type=str, default="fp32")
+    args = parser.parse_args()
+    # All parsed options are forwarded unchanged to tensorize_model
+    model_info = tensorize_model(
+        args.model_name,
+        model_path=args.model_path,
+        tensorizer_path=args.tensorizer_path,
+        dtype=args.dtype
+    )

cog-replit-code-v1-3b-main/subclass.py ADDED Viewed

	@@ -0,0 +1,284 @@

+"""sampling code pulled from Transformers & slightly modified to stream tokens"""
+import warnings
+from typing import List, Optional,  Union
+import torch
+import torch.distributed as dist
+from torch import nn
+from transformers.generation.logits_process import  LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria
+from transformers.generation.utils import SampleOutput, SampleDecoderOnlyOutput, SampleEncoderDecoderOutput
+# from transformers import AutoModelForCausalLM
+from model.modeling_mpt import MPTForCausalLM
+class YieldingReplitCode(MPTForCausalLM):
+    """MPT causal LM whose `sample` is a generator that yields tokens as they are sampled."""
+    def sample(
+            self,
+            input_ids: torch.LongTensor,
+            logits_processor: Optional[LogitsProcessorList] = None,
+            stopping_criteria: Optional[StoppingCriteriaList] = None,
+            logits_warper: Optional[LogitsProcessorList] = None,
+            max_length: Optional[int] = None,
+            pad_token_id: Optional[int] = None,
+            eos_token_id: Optional[Union[int, List[int]]] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            output_scores: Optional[bool] = None,
+            return_dict_in_generate: Optional[bool] = None,
+            synced_gpus: Optional[bool] = False,
+            **model_kwargs,
+        ) -> Union[SampleOutput, torch.LongTensor]:
+            r"""
+            Generates token ids using **multinomial sampling**, yielding each step's tokens as they are sampled.
+            This is the sampling loop pulled from Transformers and modified to stream: the function is a
+            generator, so the return annotation above (kept from the original signature) describes the
+            yielded items rather than a single return value.
+            Parameters:
+                input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                    The sequence used as a prompt for the generation.
+                logits_processor (`LogitsProcessorList`, *optional*):
+                    An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+                    used to modify the prediction scores of the language modeling head applied at each generation step.
+                stopping_criteria (`StoppingCriteriaList`, *optional*):
+                    An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+                    used to tell if the generation loop should stop.
+                logits_warper (`LogitsProcessorList`, *optional*):
+                    An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
+                    to warp the prediction score distribution of the language modeling head applied before multinomial
+                    sampling at each generation step.
+                max_length (`int`, *optional*):
+                    **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
+                    tokens. The maximum length of the sequence to be generated. Passing it emits a `UserWarning`.
+                pad_token_id (`int`, *optional*):
+                    The id of the *padding* token. Falls back to `self.generation_config.pad_token_id`.
+                eos_token_id (`int` or `List[int]`, *optional*):
+                    The id(s) of the *end-of-sequence* token. Falls back to `self.generation_config.eos_token_id`.
+                output_attentions (`bool`, *optional*):
+                    Whether or not to collect the attentions tensors of all attention layers.
+                    Falls back to `self.generation_config.output_attentions`.
+                output_hidden_states (`bool`, *optional*):
+                    Whether or not to collect the hidden states of all layers.
+                    Falls back to `self.generation_config.output_hidden_states`.
+                output_scores (`bool`, *optional*):
+                    Whether or not to collect the prediction scores.
+                    Falls back to `self.generation_config.output_scores`.
+                return_dict_in_generate (`bool`, *optional*):
+                    Whether the final yielded item is a `SampleDecoderOnlyOutput` / `SampleEncoderDecoderOutput`
+                    instead of the last sampled tokens. Falls back to `self.generation_config.return_dict_in_generate`.
+                synced_gpus (`bool`, *optional*, defaults to `False`):
+                    Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+                model_kwargs:
+                    Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+                    an encoder-decoder model the kwargs should include `encoder_outputs`.
+            Yields:
+                `torch.LongTensor`: after every step that does not end generation, the `next_tokens` sampled at
+                that step (one id per sequence in the batch).
+                Once the loop has ended, one final item is yielded: a `SampleEncoderDecoderOutput` or
+                `SampleDecoderOnlyOutput` when `return_dict_in_generate` is true, otherwise the `next_tokens`
+                of the last step (which was not yielded inside the loop).
+            Raises:
+                ValueError: If `eos_token_id` is defined but `pad_token_id` is not.
+            """
+            # init values
+            logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+            stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+            if max_length is not None:
+                warnings.warn(
+                    "`max_length` is deprecated in this function, use"
+                    " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
+                    UserWarning,
+                )
+                stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+            logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
+            pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+            eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+            # normalize a single eos id to a list so it can be iterated below
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+            output_attentions = (
+                output_attentions if output_attentions is not None else self.generation_config.output_attentions
+            )
+            output_hidden_states = (
+                output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+            )
+            return_dict_in_generate = (
+                return_dict_in_generate
+                if return_dict_in_generate is not None
+                else self.generation_config.return_dict_in_generate
+            )
+            # init attention / hidden states / scores tuples
+            scores = () if (return_dict_in_generate and output_scores) else None
+            decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+            cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+            decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+            # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+            # (these names are only bound on this path; they are only read on the same path at the end)
+            if return_dict_in_generate and self.config.is_encoder_decoder:
+                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+                encoder_hidden_states = (
+                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+                )
+            # keep track of which sequences are already finished
+            unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+            this_peer_finished = False  # used by synced_gpus only
+            # auto-regressive generation
+            while True:
+                if synced_gpus:
+                    # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                    # The following logic allows an early break if all peers finished generating their sequence
+                    this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                    # send 0.0 if we finished, 1.0 otherwise
+                    dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                    # did all peers finish? the reduced sum will be 0.0 then
+                    if this_peer_finished_flag.item() == 0.0:
+                        break
+                # prepare model inputs
+                model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+                # forward pass to get next token
+                outputs = self(
+                    **model_inputs,
+                    return_dict=True,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                )
+                if synced_gpus and this_peer_finished:
+                    continue  # don't waste resources running the code we don't need
+                next_token_logits = outputs.logits[:, -1, :]
+                # pre-process distribution
+                next_token_scores = logits_processor(input_ids, next_token_logits)
+                next_token_scores = logits_warper(input_ids, next_token_scores)
+                # Store scores, attentions and hidden_states when required
+                if return_dict_in_generate:
+                    if output_scores:
+                        scores += (next_token_scores,)
+                    if output_attentions:
+                        decoder_attentions += (
+                            (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                        )
+                        if self.config.is_encoder_decoder:
+                            cross_attentions += (outputs.cross_attentions,)
+                    if output_hidden_states:
+                        decoder_hidden_states += (
+                            (outputs.decoder_hidden_states,)
+                            if self.config.is_encoder_decoder
+                            else (outputs.hidden_states,)
+                        )
+                # sample
+                probs = nn.functional.softmax(next_token_scores, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+                # finished sentences should have their next token be a padding token
+                if eos_token_id is not None:
+                    if pad_token_id is None:
+                        raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+                    next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+                # update generated ids, model inputs, and length for next step
+                input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+                model_kwargs = self._update_model_kwargs_for_generation(
+                    outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+                )
+                # if eos_token was found in one sentence, set sentence to finished
+                if eos_token_id is not None:
+                    unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
+                # stop when each sentence is finished, or if we exceed the maximum length
+                if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+                    if not synced_gpus:
+                        break
+                    else:
+                        this_peer_finished = True
+                else:
+                    # streaming modification: hand the freshly sampled tokens to the caller
+                    yield next_tokens
+            # final item: either the full structured output or the tokens of the last step
+            if return_dict_in_generate:
+                if self.config.is_encoder_decoder:
+                    yield SampleEncoderDecoderOutput(
+                        sequences=input_ids,
+                        scores=scores,
+                        encoder_attentions=encoder_attentions,
+                        encoder_hidden_states=encoder_hidden_states,
+                        decoder_attentions=decoder_attentions,
+                        cross_attentions=cross_attentions,
+                        decoder_hidden_states=decoder_hidden_states,
+                    )
+                else:
+                    yield SampleDecoderOnlyOutput(
+                        sequences=input_ids,
+                        scores=scores,
+                        attentions=decoder_attentions,
+                        hidden_states=decoder_hidden_states,
+                    )
+            else:
+                # the step that ended the loop did not yield inside it, so emit its tokens here
+                yield next_tokens

cog.yaml ADDED Viewed

	@@ -0,0 +1,15 @@

+build:
+  gpu: true
+  cuda: "11.7"
+  python_version: "3.10"
+  python_requirements: requirements.txt
+  # commands run after the environment is setup
+  run:
+    - pip install flash-attn==0.2.8
+    - pip install triton==2.0.0.dev20221202
+    - pip install tensorizer==1.1.0
+    - echo 'deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main' | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
+    - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
+    - apt-get update && apt-get install google-cloud-cli
+predict: "predict.py:Predictor"

predict.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import time
+from typing import Optional
+import subprocess
+import torch
+import os
+from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
+from tensorizer import TensorDeserializer
+from tensorizer.utils import no_init_or_tensor
+from collections import OrderedDict
+from cog import BasePredictor, ConcatenateIterator, Input, Path
+# from config import DEFAULT_MODEL_NAME, DEFAULT_CONFIG_PATH, load_tokenizer, load_tensorizer
+from subclass import YieldingReplitCode
+# Weights are either local or in a cloud bucket.
+# This is the path from which we pull weights when there's no COG_WEIGHTS environment variable
+# (COG_WEIGHTS is a thing for trainable models).
+# For development, point to a local path on disk instead, e.g.:
+# TENSORIZER_WEIGHTS_PATH = "model/model.tensors"
+TENSORIZER_WEIGHTS_PATH = "gs://replicate-weights/replit-code-v1-3b/model.tensors"
+# Set this to a GCP URL when pushing the model (alternative left disabled below)
+# TENSORIZER_WEIGHTS_PATH = None
+# Directory holding the model config
+DEFAULT_CONFIG_PATH = "model/"
+# Directory holding the tokenizer files
+TOKENIZER_PATH = "model/"
+def maybe_download(path):
+    """Return a local path for `path`, copying it down with gcloud first if it is a gs:// URL."""
+    # Anything that is not a gs:// URL is used as given.
+    if not path.startswith("gs://"):
+        return path
+    local_copy = "/tmp/weights.tensors"
+    started = time.time()
+    subprocess.check_call(["gcloud", "storage", "cp", path, local_copy])
+    print(f"weights downloaded in {time.time() - started}")
+    return local_copy
+class Predictor(BasePredictor):
+    """Cog predictor that streams sampled text from the tensorized model, word by word."""
+    def setup(self):
+        """Load the model weights and the tokenizer."""
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # set TOKENIZERS_PARALLELISM to false to avoid a warning
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        self.model = self.load_tensorizer(
+            weights=maybe_download(TENSORIZER_WEIGHTS_PATH), plaid_mode=True, cls=YieldingReplitCode, config_path=DEFAULT_CONFIG_PATH,
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
+    def load_tensorizer(self, weights, plaid_mode, cls, config_path):
+        """
+        Build an uninitialized model of type `cls` and fill it from tensorized weights.
+        Args:
+            weights: Path of the tensorized weights file passed to `TensorDeserializer`.
+            plaid_mode: Forwarded to `TensorDeserializer(plaid_mode=...)`.
+            cls: Model class whose `from_pretrained` builds the empty module.
+            config_path: Location of the model config loaded with `AutoConfig`.
+        Returns:
+            The loaded model, cast to bfloat16 when that cast succeeds.
+        """
+        st = time.time()
+        print(f"deserializing weights from {weights}")
+        config = AutoConfig.from_pretrained(config_path, trust_remote_code=True)
+        config.attn_config['attn_impl'] = 'triton'
+        # Build the module without initializing its weights; they are filled in below.
+        model = no_init_or_tensor(
+            lambda: cls.from_pretrained(
+                None, config=config, state_dict=OrderedDict(), trust_remote_code=True,
+            )
+        )
+        # Honor the caller's plaid_mode instead of a hard-coded True.
+        deserialized = TensorDeserializer(weights, plaid_mode=plaid_mode)
+        deserialized.load_into_module(model)
+        try:
+            model = model.to(dtype=torch.bfloat16)
+        except Exception as exc:
+            # Best effort: keep the model in its loaded dtype, but say so instead of hiding it.
+            print(f"could not cast model to bfloat16, keeping loaded dtype: {exc}")
+        print(f"weights loaded in {time.time() - st}")
+        return model
+    def predict(
+        self,
+        prompt: str = Input(description="Text prompt"),
+        max_length: int = Input(
+            description="Maximum number of tokens to generate. A word is generally 2-3 tokens",
+            ge=1,
+            default=500,
+        ),
+        temperature: float = Input(
+            description="Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic, 0.75 is a good starting value.",
+            ge=0.01,
+            le=5,
+            default=0.75,
+        ),
+        top_p: float = Input(
+            description="When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens",
+            ge=0.01,
+            le=1.0,
+            default=1.0,
+        ),
+        repetition_penalty: float = Input(
+            description="Penalty for repeated words in generated text; 1 is no penalty, values greater than 1 discourage repetition, less than 1 encourage it.",
+            ge=0.01,
+            le=5,
+            default=1,
+        ),
+        length_penalty: float = Input(
+            description="Increasing the length_penalty parameter above 1.0 will cause the model to favor longer sequences, while decreasing it below 1.0 will cause the model to favor shorter sequences.",
+            ge=0.01,
+            le=5,
+            default=1,
+        ),
+        no_repeat_ngram_size: int = Input(
+            description="If set to int > 0, all ngrams of size no_repeat_ngram_size can only occur once.",
+            ge=0,
+            default=0,
+        ),
+        stop_sequence: str = Input(
+            description="Generation will hault if this token is produced. Currently, only single token stop sequences are support and it is recommended to use `###` as the stop sequence if you want to control generation termination.",
+            default=None,
+        ),
+        seed: int = Input(
+            description="Set seed for reproducible outputs. Set to -1 for random seed.",
+            ge=-1,
+            default=-1,
+        ),
+        debug: bool = Input(
+            description="provide debugging output in logs", default=False
+        ),
+    ) -> ConcatenateIterator[str]:
+        """
+        Sample a completion for `prompt` and yield it in whitespace-delimited pieces.
+        Tokens are buffered until a token that starts a new word arrives; the buffered
+        ids are then decoded and yielded together. Generation stops at `<|endoftext|>`,
+        at a token equal to `stop_sequence`, or when the model stops yielding tokens.
+        """
+        # Named input_ids rather than `input` to avoid shadowing the builtin.
+        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
+        # set torch seed
+        if seed == -1:
+            torch.seed()
+        else:
+            torch.manual_seed(seed)
+            torch.cuda.manual_seed(seed)
+        with torch.inference_mode():
+            first_token_yielded = False
+            prev_ids = []
+            for output in self.model.generate(
+                input_ids,
+                max_length=max_length,
+                do_sample=True,
+                temperature=temperature,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                length_penalty=length_penalty,
+                no_repeat_ngram_size=no_repeat_ngram_size,
+            ):
+                cur_id = output.item()
+                # in order to properly handle spaces, we need to do our own tokenizing. Fun!
+                # we're building up a buffer of sub-word / punctuation tokens until we hit a space, and then yielding whole words + punctuation.
+                cur_token = self.tokenizer.convert_ids_to_tokens(cur_id)
+                # skip initial newline, which this almost always yields.
+                # hack - the id is hard-coded as 187 (an earlier comment said 13; presumably 187 is
+                # the newline id for this tokenizer - TODO confirm).
+                if not first_token_yielded and not prev_ids and cur_id == 187:
+                    continue
+                # Ġ means a space, means we yield previous tokens
+                if cur_token.startswith("Ġ"):  # this is not a standard G.
+                    # first token
+                    if not prev_ids:
+                        prev_ids = [cur_id]
+                        continue
+                    # there are tokens to yield
+                    else:
+                        token = self.tokenizer.decode(prev_ids, clean_up_tokenization_spaces=False)
+                        prev_ids = [cur_id]
+                        if not first_token_yielded:
+                            # no leading space for first token
+                            token = token.strip()
+                            first_token_yielded = True
+                        yield token
+                # End token
+                elif cur_token == "<|endoftext|>":
+                    break
+                elif stop_sequence and cur_token == stop_sequence:
+                    break
+                else:
+                    prev_ids.append(cur_id)
+                    continue
+            # flush whatever is still buffered; remove any special tokens such as </s>
+            token = self.tokenizer.decode(prev_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+            if not first_token_yielded:
+                # no leading space for first token
+                token = token.strip()
+                first_token_yielded = True
+            yield token
+        if debug:
+            print(f"cur memory: {torch.cuda.memory_allocated()}")
+            print(f"max allocated: {torch.cuda.max_memory_allocated()}")
+            print(f"peak memory: {torch.cuda.max_memory_reserved()}")

requirements.txt CHANGED Viewed

@@ -1,4 +1,6 @@
-streamlit==1.25.0  # Latest stable version of Streamlit
-transformers==4.33.0  # Hugging Face Transformers library
-torch>=1.9.0  # PyTorch, required for Hugging Face models
-numpy>=1.21.0  # Numerical library for model dependencies

+einops==0.6.1
+sentencepiece==0.1.99
+torch==2.0.1
+transformers==4.29.2
+# flash-attn==0.2.8
+# triton==2.0.0.dev20221202

scripts/download_and_prepare_model.py ADDED Viewed

	@@ -0,0 +1,107 @@

+#!/usr/bin/env python
+import os
+import shutil
+import argparse
+import logging
+import sys
+import torch
+from distutils.dir_util import copy_tree
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from huggingface_hub import snapshot_download, login
+from tensorizer import TensorSerializer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from tensorize_model import tensorize_model
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+def download_model_from_hf_hub(
+        model_name: str,
+        model_path: str,
+        rm_existing_model: bool = True,
+    ) -> dict:
+    """
+    This function downloads a model from the Hugging Face Hub and saves it locally.
+    It also saves the tokenizer in a separate location so that it can be easely included in a docker Image
+    without including the model weights.
+    Args:
+        model_name (str): Name of model on hugging face hub
+        path (str): Local path where model is saved
+        rm_existing_model (bool, optional): Whether to remove the existing model or not. Defaults to False.
+    Returns:
+        dict: Dictionary containing the model name and path
+    """
+    # model_weights_path = os.path.join(os.getcwd(), "model_weights/torch_weights")
+    # model_path = os.path.join(model_weights_path, model_name)
+    if rm_existing_model:
+        logger.info(f"Removing existing model at {model_path}")
+        if os.path.exists(model_path):
+            shutil.rmtree(model_path)
+    # setup temporary directory
+    with TemporaryDirectory() as tmpdir:
+        logger.info(f"Downloading {model_name} weights to temp...")
+        snapshot_dir = snapshot_download(
+            repo_id=model_name,
+            cache_dir=tmpdir,
+            allow_patterns=["*.bin", "*.json", "*.md", "*.model", "*.py"],
+        )
+        # copy snapshot to model dir
+        logger.info(f"Copying weights to {model_path}...")
+        copy_tree(snapshot_dir, str(model_path))
+    return {"model_name": model_name, "model_path": model_path}
+def download_hf_model_and_copy_tokenizer(
+        model_name: str,
+        model_path: str,
+        tokenizer_path: str,
+        rm_existing_model: bool = True,
+):
+    model_info = download_model_from_hf_hub(model_name, model_path)
+    if tokenizer_path:
+        # Move tokenizer to separate location
+        logging.info(f"Copying tokenizer and model config to {tokenizer_path}...")
+        tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")
+        tokenizer.save_pretrained(tokenizer_path)
+        # Set the source and destination file paths
+        config_path = os.path.join(model_path, "config.json")
+        # Use the shutil.copy() function to copy the file to the destination directory
+        shutil.copy(config_path, tokenizer_path)
+    return model_info
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name", type=str)
+    parser.add_argument("--model_path", type=str)
+    parser.add_argument("--tokenizer_path", type=str, default=None)
+    parser.add_argument("--hf_token", type=str, default=None)
+    parser.add_argument("--tensorize", action="store_true", default=False)
+    parser.add_argument("--dtype", type=str, default="fp32")
+    args = parser.parse_args()
+    if args.hf_token is not None:
+        login(token=args.hf_token)
+    # download_hf_model_and_copy_tokenizer(args.model_name, model_path=args.model_path, tokenizer_path=args.tokenizer_path)
+    tensorizer_path = os.path.join(args.model_path, "model.tensors")
+    if args.tensorize:
+        model = tensorize_model(args.model_name, model_path=args.model_path, dtype=args.dtype, tensorizer_path=tensorizer_path)

scripts/tensorize_model.py ADDED Viewed

	@@ -0,0 +1,91 @@

+#!/usr/bin/env python
+import torch
+import os
+import argparse
+import logging
+import sys
+from tensorizer import TensorSerializer
+from transformers import AutoModelForCausalLM, AutoConfig
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+def tensorize_model(
+        model_name: str,
+        model_path: str,
+        tensorizer_path: str,
+        dtype: str = "fp32",
+) -> dict:
+    """
+    Create a tensorized version of model weights. If fp16 or bf16 is True,
+    the model will be converted to fp16 or bf16.
+    If `model_path` is None weights will be saved in `./model_weights/torch_weights/model_name`.
+    If `tensorizer_path` is None weights will be saved in `./model_weights/tensorizer_weights/model_name/dtype_str`.
+    Args:
+        model_name (str): Name of model on hugging face hub
+        model_path (str, optional): Local path where model weights are saved.
+        tensorizer_path (str, optional): Local path where tensorizer weights are saved.
+        path (str): Local path where tensorized model weights are saved
+        dtype (str): One of `"fp32"`, `"fp16"`, and `"bf16"`. Defaults to `"fp32"`.
+    Returns:
+        dict: Dictionary containing the tensorized model path and dtype.
+    """
+    if dtype == 'fp32' or dtype is None:
+        torch_dtype = torch.float32
+    elif dtype == 'bf16':
+        torch_dtype = torch.bfloat16
+    elif dtype == 'fp16':
+        torch_dtype = torch.float16
+    logger.info(f"Loading {model_name} in {dtype} from {model_path}...")
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path, trust_remote_code=True,
+    ).to('cuda:0')
+    logger.info(f"Tensorizing model {model_name} in {dtype} and writing tensors to {tensorizer_path}...")
+    serializer = TensorSerializer(tensorizer_path)
+    serializer.write_module(model)
+    serializer.close()
+    # Write config to tensorized model weights directory
+    # dir_path = os.path.dirname(tensorizer_path)
+    # config_path = os.path.join(dir_path, 'config.json')
+    model_config = model.config
+    model_config.save_pretrained(model_name)
+    logger.info(f"Tensorized model {model_name} in {dtype} and wrote tensors to {tensorizer_path} and config to {config_path}...")
+    return {"tensorized_weights_path": tensorizer_path, "dtype": dtype}
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=(
+        "A simple script for tensorizing a torch model."
+        )
+    )
+    parser.add_argument("--model_name", type=str)
+    parser.add_argument("--model_path", type=str, default=None)
+    parser.add_argument("--tensorizer_path", type=str, default=None)
+    parser.add_argument("--dtype", type=str, default="fp32")
+    args = parser.parse_args()
+    model_info = tensorize_model(
+        args.model_name,
+        model_path=args.model_path,
+        tensorizer_path=args.tensorizer_path,
+        dtype=args.dtype
+    )

subclass.py ADDED Viewed

	@@ -0,0 +1,284 @@

+"""sampling code pulled from Transformers & slightly modified to stream tokens"""
+import warnings
+from typing import List, Optional,  Union
+import torch
+import torch.distributed as dist
+from torch import nn
+from transformers.generation.logits_process import  LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria
+from transformers.generation.utils import SampleOutput, SampleDecoderOnlyOutput, SampleEncoderDecoderOutput
+# from transformers import AutoModelForCausalLM
+from model.modeling_mpt import MPTForCausalLM
+class YieldingReplitCode(MPTForCausalLM):
+    """`MPTForCausalLM` subclass whose `sample` is a generator, so sampled tokens can be streamed step by step."""
+    def sample(
+            self,
+            input_ids: torch.LongTensor,
+            logits_processor: Optional[LogitsProcessorList] = None,
+            stopping_criteria: Optional[StoppingCriteriaList] = None,
+            logits_warper: Optional[LogitsProcessorList] = None,
+            max_length: Optional[int] = None,
+            pad_token_id: Optional[int] = None,
+            eos_token_id: Optional[Union[int, List[int]]] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            output_scores: Optional[bool] = None,
+            return_dict_in_generate: Optional[bool] = None,
+            synced_gpus: Optional[bool] = False,
+            **model_kwargs,
+        ) -> Union[SampleOutput, torch.LongTensor]:
+            r"""
+            Streaming variant of multinomial sampling: this method is a generator that yields the sampled token ids
+            of each decoding step instead of returning the finished sequences. Note that the return annotation above
+            is inherited from the upstream signature and does not reflect the generator behaviour.
+            Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
+            can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+            <Tip warning={true}>
+            In most cases, you do not need to call [`~generation.GenerationMixin.sample`] directly. Use generate() instead.
+            For an overview of generation strategies and code examples, check the [following
+            guide](./generation_strategies).
+            </Tip>
+            Parameters:
+                input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                    The sequence used as a prompt for the generation.
+                logits_processor (`LogitsProcessorList`, *optional*):
+                    An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+                    used to modify the prediction scores of the language modeling head applied at each generation step.
+                stopping_criteria (`StoppingCriteriaList`, *optional*):
+                    An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+                    used to tell if the generation loop should stop.
+                logits_warper (`LogitsProcessorList`, *optional*):
+                    An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
+                    to warp the prediction score distribution of the language modeling head applied before multinomial
+                    sampling at each generation step.
+                max_length (`int`, *optional*, defaults to 20):
+                    **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
+                    tokens. The maximum length of the sequence to be generated.
+                pad_token_id (`int`, *optional*):
+                    The id of the *padding* token.
+                eos_token_id (`int`, *optional*):
+                    The id of the *end-of-sequence* token.
+                output_attentions (`bool`, *optional*, defaults to `False`):
+                    Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                    returned tensors for more details.
+                output_hidden_states (`bool`, *optional*, defaults to `False`):
+                    Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                    for more details.
+                output_scores (`bool`, *optional*, defaults to `False`):
+                    Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+                return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                    Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+                synced_gpus (`bool`, *optional*, defaults to `False`):
+                    Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+                model_kwargs:
+                    Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+                    an encoder-decoder model the kwargs should include `encoder_outputs`.
+            Yields:
+                For every decoding step that does not end generation: the `torch.LongTensor` of token ids sampled at
+                that step (`next_tokens`). Once the loop has ended, one final item is yielded: a
+                [`~generation.SampleDecoderOnlyOutput`] or [`~generation.SampleEncoderDecoderOutput`] (depending on
+                `model.config.is_encoder_decoder`) if `return_dict_in_generate=True`, otherwise the token ids sampled
+                at the last step.
+            Examples:
+            The example below is kept from the upstream Transformers docstring; with this override `model.sample(...)`
+            produces a generator that has to be iterated rather than a tensor of finished sequences.
+            ```python
+            >>> from transformers import (
+            ...     AutoTokenizer,
+            ...     AutoModelForCausalLM,
+            ...     LogitsProcessorList,
+            ...     MinLengthLogitsProcessor,
+            ...     TopKLogitsWarper,
+            ...     TemperatureLogitsWarper,
+            ...     StoppingCriteriaList,
+            ...     MaxLengthCriteria,
+            ... )
+            >>> import torch
+            >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+            >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+            >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
+            >>> model.config.pad_token_id = model.config.eos_token_id
+            >>> model.generation_config.pad_token_id = model.config.eos_token_id
+            >>> input_prompt = "Today is a beautiful day, and"
+            >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
+            >>> # instantiate logits processors
+            >>> logits_processor = LogitsProcessorList(
+            ...     [
+            ...         MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
+            ...     ]
+            ... )
+            >>> # instantiate logits processors
+            >>> logits_warper = LogitsProcessorList(
+            ...     [
+            ...         TopKLogitsWarper(50),
+            ...         TemperatureLogitsWarper(0.7),
+            ...     ]
+            ... )
+            >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
+            >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
+            >>> outputs = model.sample(
+            ...     input_ids,
+            ...     logits_processor=logits_processor,
+            ...     logits_warper=logits_warper,
+            ...     stopping_criteria=stopping_criteria,
+            ... )
+            >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            ['Today is a beautiful day, and a wonderful day.\n\nI was lucky enough to meet the']
+            ```"""
+            # init values
+            logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+            stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+            if max_length is not None:
+                warnings.warn(
+                    "`max_length` is deprecated in this function, use"
+                    " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
+                    UserWarning,
+                )
+                stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+            logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
+            # fall back to the model's generation_config for every option the caller left unset
+            pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+            eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+            # normalise a single eos id to a list so the finished-sequence check below can iterate over it
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+            output_attentions = (
+                output_attentions if output_attentions is not None else self.generation_config.output_attentions
+            )
+            output_hidden_states = (
+                output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+            )
+            return_dict_in_generate = (
+                return_dict_in_generate
+                if return_dict_in_generate is not None
+                else self.generation_config.return_dict_in_generate
+            )
+            # init attention / hidden states / scores tuples
+            scores = () if (return_dict_in_generate and output_scores) else None
+            decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+            cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+            decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+            # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+            # (these two names are only bound on this path and only read on the matching path after the loop)
+            if return_dict_in_generate and self.config.is_encoder_decoder:
+                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+                encoder_hidden_states = (
+                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+                )
+            # keep track of which sequences are already finished
+            unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+            this_peer_finished = False  # used by synced_gpus only
+            # auto-regressive generation
+            while True:
+                if synced_gpus:
+                    # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                    # The following logic allows an early break if all peers finished generating their sequence
+                    this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                    # send 0.0 if we finished, 1.0 otherwise
+                    dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                    # did all peers finish? the reduced sum will be 0.0 then
+                    if this_peer_finished_flag.item() == 0.0:
+                        break
+                # prepare model inputs
+                model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+                # forward pass to get next token
+                outputs = self(
+                    **model_inputs,
+                    return_dict=True,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                )
+                if synced_gpus and this_peer_finished:
+                    continue  # don't waste resources running the code we don't need
+                # logits at the last position of every sequence in the batch
+                next_token_logits = outputs.logits[:, -1, :]
+                # pre-process distribution
+                next_token_scores = logits_processor(input_ids, next_token_logits)
+                next_token_scores = logits_warper(input_ids, next_token_scores)
+                # Store scores, attentions and hidden_states when required
+                if return_dict_in_generate:
+                    if output_scores:
+                        scores += (next_token_scores,)
+                    if output_attentions:
+                        decoder_attentions += (
+                            (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                        )
+                        if self.config.is_encoder_decoder:
+                            cross_attentions += (outputs.cross_attentions,)
+                    if output_hidden_states:
+                        decoder_hidden_states += (
+                            (outputs.decoder_hidden_states,)
+                            if self.config.is_encoder_decoder
+                            else (outputs.hidden_states,)
+                        )
+                # sample
+                probs = nn.functional.softmax(next_token_scores, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+                # finished sentences should have their next token be a padding token
+                if eos_token_id is not None:
+                    if pad_token_id is None:
+                        raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+                    next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+                # update generated ids, model inputs, and length for next step
+                input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+                model_kwargs = self._update_model_kwargs_for_generation(
+                    outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+                )
+                # if eos_token was found in one sentence, set sentence to finished
+                if eos_token_id is not None:
+                    unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
+                # stop when each sentence is finished, or if we exceed the maximum length
+                if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+                    if not synced_gpus:
+                        break
+                    else:
+                        this_peer_finished = True
+                else:
+                    # stream this step's sampled ids to the caller; the step that ends generation is not
+                    # yielded here but handled after the loop
+                    yield next_tokens
+            # final item of the stream: the full output object if requested, otherwise the last sampled ids
+            if return_dict_in_generate:
+                if self.config.is_encoder_decoder:
+                    yield SampleEncoderDecoderOutput(
+                        sequences=input_ids,
+                        scores=scores,
+                        encoder_attentions=encoder_attentions,
+                        encoder_hidden_states=encoder_hidden_states,
+                        decoder_attentions=decoder_attentions,
+                        cross_attentions=cross_attentions,
+                        decoder_hidden_states=decoder_hidden_states,
+                    )
+                else:
+                    yield SampleDecoderOnlyOutput(
+                        sequences=input_ids,
+                        scores=scores,
+                        attentions=decoder_attentions,
+                        hidden_states=decoder_hidden_states,
+                    )
+            else:
+                yield next_tokens