diff --git a/README.md b/README.md
index 83a91861..0c1c2774 100644
--- a/README.md
+++ b/README.md
@@ -178,7 +178,7 @@ Optionally, you can use the following command-line flags:
 
 | Flag                                       | Description |
 |--------------------------------------------|-------------|
-| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen |
+| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv |
 
 #### Accelerate/transformers
 
@@ -255,14 +255,6 @@ Optionally, you can use the following command-line flags:
 | `--warmup_autotune` | (triton) Enable warmup autotune. |
 | `--fused_mlp` | (triton) Enable fused mlp. |
 
-#### FlexGen
-
-| Flag             | Description |
-|------------------|-------------|
-| `--percent PERCENT [PERCENT ...]` | FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). |
-| `--compress-weight` | FlexGen: Whether to compress weight (default: False).|
-| `--pin-weight [PIN_WEIGHT]` | FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). |
-
 #### DeepSpeed
 
 | Flag | Description |
diff --git a/docs/README.md b/docs/README.md
index 6c5c4db8..a4bcdb61 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -8,7 +8,6 @@
 * [Docker](Docker.md)
 * [ExLlama](ExLlama.md)
 * [Extensions](Extensions.md)
-* [FlexGen](FlexGen.md)
 * [Generation parameters](Generation-parameters.md)
 * [GGML (llama.cpp) models](GGML-llama.cpp-models.md)
 * [GPT-4chan model](GPT-4chan-model.md)
diff --git a/modules/shared.py b/modules/shared.py
index 937b4c51..3f2fff29 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -155,11 +155,6 @@ parser.add_argument('--desc_act', action='store_true', help='For models that don
 parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7")
 parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.")
 
-# FlexGen
-parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).')
-parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.")
-parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, default=True, help="FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%%).")
-
 # DeepSpeed
 parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
 parser.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
diff --git a/requirements.txt b/requirements.txt
index f085628a..4a4f03f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,6 @@ colorama
 datasets
 einops
 fastapi==0.95.2
-flexgen==0.1.7
 gradio_client==0.2.5
 gradio==3.33.1
 markdown