Spaces:
Runtime error
Runtime error
Feature(MInference): update information
Browse files
app.py
CHANGED
|
@@ -13,7 +13,10 @@ from transformers.utils.import_utils import _is_package_available
|
|
| 13 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 14 |
|
| 15 |
|
| 16 |
-
DESCRIPTION = """
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
_Huiqiang Jiang†, Yucheng Li†, Chengruidong Zhang†, Qianhui Wu, Xufang Luo, Surin Ahn, Zhenhua Han, Amir H. Abdi, Dongsheng Li, Chin-Yew Lin, Yuqing Yang and Lili Qiu_
|
| 19 |
|
|
@@ -73,7 +76,7 @@ if torch.cuda.is_available() and _is_package_available("pycuda"):
|
|
| 73 |
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
|
| 74 |
|
| 75 |
|
| 76 |
-
@spaces.GPU(duration=120)
|
| 77 |
def chat_llama3_8b(
|
| 78 |
message: str, history: list, temperature: float, max_new_tokens: int
|
| 79 |
) -> str:
|
|
|
|
| 13 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 14 |
|
| 15 |
|
| 16 |
+
DESCRIPTION = """
|
| 17 |
+
<div>
|
| 18 |
+
<h1>MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via Dynamic Sparse Attention (Under Review, ES-FoMo @ ICML'24)</h1>
|
| 19 |
+
</div>
|
| 20 |
|
| 21 |
_Huiqiang Jiang†, Yucheng Li†, Chengruidong Zhang†, Qianhui Wu, Xufang Luo, Surin Ahn, Zhenhua Han, Amir H. Abdi, Dongsheng Li, Chin-Yew Lin, Yuqing Yang and Lili Qiu_
|
| 22 |
|
|
|
|
| 76 |
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
|
| 77 |
|
| 78 |
|
| 79 |
+
# @spaces.GPU(duration=120)
|
| 80 |
def chat_llama3_8b(
|
| 81 |
message: str, history: list, temperature: float, max_new_tokens: int
|
| 82 |
) -> str:
|