# API 参考

使用指南 - 核心模块详解

核心模块能力说明。

## 核心模块详解

<!-- ### 1. 数据访问层 (Data Access Layer)

nebula-data-access

核心抽象层,提供统一的数据访问接口

// 统一Repository接口
@Repository
public class UserRepository extends AbstractRepository<User, Long> {

    @Override
    protected Class<User> getEntityClass() {
        return User.class;
    }
}

// 查询构建器
QueryBuilder query = DefaultQueryBuilder.create()
    .eq("status", "ACTIVE")
    .like("name", "张%")
    .gt("createTime", lastWeek)
    .build();
``` -->

#### nebula-data-persistence (MyBatis-Plus集成)
**关系型数据库持久化支持**
```yaml
nebula:
  data:
    persistence:
      # 读写分离配置
      read-write-separation:
        enabled: true
        master:
          url: jdbc:mysql://master:3306/nebula
          username: root
          password: password
        slave:
          url: jdbc:mysql://slave:3306/nebula
          username: reader
          password: password
      # 分库分表配置  
      sharding:
        enabled: true
        tables:
          user:
            actual-data-nodes: ds_${0..1}.user_${0..3}
            table-strategy:
              inline:
                sharding-column: id
                algorithm-expression: user_${id % 4}
```

<!-- #### nebula-data-mongodb (NoSQL支持) MongoDB集成支持

@Service
public class DocumentService {

    @Autowired
    private MongoRepository mongoRepository;

    public void saveDocument(Document doc) {
        mongoRepository.save(doc);
    }

    public List<Document> findByCategory(String category) {
        return mongoRepository.findByCategory(category);
    }
}
``` -->

#### nebula-data-cache (缓存支持)
**多级缓存管理**
```yaml
nebula:
  data:
    cache:
      # 多级缓存配置
      multi-level:
        enabled: true
        local:
          type: caffeine
          max-size: 10000
          expire-after-write: 5m
        remote:
          type: redis
          expire-after-write: 1h
          key-prefix: "nebula:"
```

### 2. 消息传递层 (Messaging Layer)

#### nebula-messaging-core & nebula-messaging-rabbitmq
**消息队列抽象和RabbitMQ实现**
```java
@Service
public class NotificationService {

    @Autowired
    private MessageManager messageManager;

    public void sendNotification(String userId, String message) {
        Message<String> msg = Message.<String>builder()
            .topic("user-notifications")
            .payload(message)
            .build();

        messageManager.getProducer().send("user-notifications", msg);
    }

    @MessageHandler("user-notifications")
    public void handleNotification(Message<String> message) {
        // 处理通知消息
        log.info("处理通知: {}", message.getPayload());
    }
}
```

### 3. 服务发现与RPC层

#### nebula-discovery-core & nebula-discovery-nacos
**服务注册发现**
```yaml
nebula:
  discovery:
    nacos:
      enabled: true
      server-addr: localhost:8848
      namespace: nebula-dev
      group: DEFAULT_GROUP
```

#### nebula-rpc-core & nebula-rpc-http
**远程调用支持**
```java
@RpcClient("user-service")
public interface UserRpcClient {

    @RpcCall("/api/users/{id}")
    User getUserById(@PathParam("id") Long id);

    @RpcCall(value = "/api/users", method = "POST")
    User createUser(@RequestBody CreateUserRequest request);
}
```

### 4. 对象存储层 (Storage Layer)

#### nebula-storage-core, nebula-storage-minio, nebula-storage-aliyun-oss
**统一对象存储接口**
```java
@Service
public class FileService {

    @Autowired
    private StorageService storageService;

    public String uploadFile(MultipartFile file) {
        ObjectMetadata metadata = ObjectMetadata.builder()
            .contentType(file.getContentType())
            .contentLength(file.getSize())
            .build();

        StorageResult result = storageService.upload(
            "documents/" + file.getOriginalFilename(),
            file.getInputStream(),
            metadata
        );

        return result.getUrl();
    }
}
```

配置示例:

```yaml
nebula:
  storage:
    # MinIO配置
    minio:
      enabled: true
      endpoint: http://localhost:9000
      access-key: minioadmin
      secret-key: minioadmin
      default-bucket: nebula-files

    # 阿里云OSS配置
    aliyun:
      oss:
        enabled: false
        endpoint: https://oss-cn-hangzhou.aliyuncs.com
        access-key-id: ${ALIYUN_ACCESS_KEY}
        access-key-secret: ${ALIYUN_SECRET_KEY}
        default-bucket: nebula-oss
```

### 5. 搜索引擎层 (Search Layer)

#### nebula-search-core & nebula-search-elasticsearch
**全文搜索支持**
```java
@Service
public class ProductSearchService {

    @Autowired
    private SearchService searchService;

    public void indexProduct(Product product) {
        SearchDocument document = SearchDocument.builder()
            .id(product.getId().toString())
            .content(product.getName() + " " + product.getDescription())
            .metadata(Map.of(
                "category", product.getCategory(),
                "price", product.getPrice(),
                "brand", product.getBrand()
            ))
            .build();

        searchService.index("products", document);
    }

    public SearchResult searchProducts(String query, String category) {
        SearchQuery searchQuery = SearchQuery.builder()
            .query(query)
            .filter("category", category)
            .size(20)
            .build();

        return searchService.search("products", searchQuery);
    }
}
```

### 6. 第三方集成层 (Integration Layer)

#### nebula-integration-payment
**支付集成抽象**
```java
@Service
public class OrderService {

    @Autowired
    private PaymentService paymentService;

    public PaymentResponse createPayment(Order order) {
        PaymentRequest request = PaymentRequest.builder()
            .orderNo(order.getOrderNo())
            .amount(order.getTotalAmount())
            .currency("CNY")
            .subject(order.getTitle())
            .buyerInfo(BuyerInfo.builder()
                .buyerId(order.getUserId().toString())
                .buyerName(order.getUserName())
                .build())
            .build();

        return paymentService.createPayment(request);
    }
}
```

配置示例:

```yaml
nebula:
  payment:
    # Mock支付(开发测试)
    mock:
      enabled: true
      auto-success-delay: 60

    # 支付宝配置
    alipay:
      enabled: false
      app-id: ${ALIPAY_APP_ID}
      private-key: ${ALIPAY_PRIVATE_KEY}
      public-key: ${ALIPAY_PUBLIC_KEY}

    # 微信支付配置
    wechat-pay:
      enabled: false
      app-id: ${WECHAT_APP_ID}
      mch-id: ${WECHAT_MCH_ID}
      mch-key: ${WECHAT_MCH_KEY}
```

### 7. 人工智能层 (AI Layer)

#### nebula-ai-core & nebula-ai-spring
**AI能力集成 (基于Spring AI)**
```java
@Service
public class AIService {

    @Autowired
    private ChatService chatService;

    @Autowired
    private EmbeddingService embeddingService;

    @Autowired
    private VectorStoreService vectorStoreService;

    // 智能聊天
    public String chat(String message) {
        ChatResponse response = chatService.chat(message);
        return response.getContent();
    }

    // 文档智能问答 (RAG)
    public String intelligentQA(String question) {
        // 1. 搜索相关文档
        SearchResult searchResult = vectorStoreService.search(question, 3);

        // 2. 构建上下文
        String context = searchResult.getContents()
            .stream()
            .collect(Collectors.joining("\n"));

        // 3. 生成回答
        List<ChatMessage> messages = List.of(
            ChatMessage.system("基于以下上下文回答问题:\n" + context),
            ChatMessage.user(question)
        );

        return chatService.chat(messages).getContent();
    }
}
```

配置示例:

```yaml
nebula:
  ai:
    enabled: true
    # 聊天配置
    chat:
      default-provider: openai
      providers:
        openai:
          api-key: ${OPENAI_API_KEY}
          model: gpt-3.5-turbo
          options:
            temperature: 0.7
            max-tokens: 1000

    # 嵌入配置
    embedding:
      default-provider: openai
      providers:
        openai:
          api-key: ${OPENAI_API_KEY}
          model: text-embedding-ada-002

    # 向量存储配置
    vector-store:
      default-provider: chroma
      providers:
        chroma:
          host: localhost
          port: 8000
          collection-name: nebula-docs
```

### 8. 网页爬虫层 (Crawler Layer)

Nebula Crawler 是一套完整的网页数据采集模块组,支持 HTTP 和浏览器两种采集模式,提供代理池管理和验证码处理能力。

#### 模块架构

```mermaid
graph TB
    subgraph 爬虫模块组
        A[nebula-crawler-core<br/>核心抽象层] --> B[nebula-crawler-http<br/>HTTP爬虫引擎]
        A --> C[nebula-crawler-browser<br/>浏览器爬虫引擎]
        A --> D[nebula-crawler-proxy<br/>代理池管理]
        A --> E[nebula-crawler-captcha<br/>验证码处理]
    end

    B --> F[OkHttp 客户端]
    C --> G[Playwright]
    D --> H[Redis 缓存]
    E --> I[ddddocr/第三方API]
```

#### nebula-crawler-core (核心抽象层)
**统一的爬虫引擎接口和请求/响应模型**
```java
// 爬虫引擎接口
public interface CrawlerEngine {
    String getType();                                        // 引擎类型
    CrawlerResponse crawl(CrawlerRequest request);          // 同步爬取
    CompletableFuture<CrawlerResponse> crawlAsync(CrawlerRequest request); // 异步爬取
    List<CrawlerResponse> crawlBatch(List<CrawlerRequest> requests);       // 批量爬取
    void shutdown();                                         // 关闭引擎
    boolean isHealthy();                                     // 健康检查
}
// 构建请求
CrawlerRequest request = CrawlerRequest.get("https://example.com")
    .header("Accept", "text/html")
    .timeout(30000)
    .build();

// 浏览器渲染请求
CrawlerRequest jsRequest = CrawlerRequest.renderPage("https://spa-app.com")
    .waitSelector("#content")
    .waitTimeout(5000)
    .screenshot(true)
    .build();
// 处理响应
CrawlerResponse response = engine.crawl(request);
if (response.isSuccess()) {
    // 解析为 Jsoup Document
    Document doc = response.asDocument();
    String title = doc.select("title").text();

    // 或解析为 JSON
    Map<String, Object> data = response.asMap();
}
```

#### nebula-crawler-http (HTTP爬虫引擎)
**基于 OkHttp 的高性能 HTTP 爬虫**
```java
@Service
public class DataCrawlerService {

    @Autowired
    private HttpCrawlerEngine httpEngine;

    public String fetchPage(String url) {
        CrawlerRequest request = CrawlerRequest.get(url)
            .header("User-Agent", "Mozilla/5.0...")
            .retryCount(3)
            .build();

        CrawlerResponse response = httpEngine.crawl(request);

        if (response.isSuccess()) {
            return response.getContent();
        } else {
            log.error("爬取失败: {}, 错误: {}", url, response.getErrorMessage());
            return null;
        }
    }

    // 批量爬取
    public List<String> fetchPages(List<String> urls) {
        List<CrawlerRequest> requests = urls.stream()
            .map(CrawlerRequest::get)
            .collect(Collectors.toList());

        return httpEngine.crawlBatch(requests).stream()
            .filter(CrawlerResponse::isSuccess)
            .map(CrawlerResponse::getContent)
            .collect(Collectors.toList());
    }
}
```

特性: - 连接池管理(可配置最大连接数、保活时间) - User-Agent 轮换 - QPS 限流 - 自动重试 - 代理支持

#### nebula-crawler-browser (浏览器爬虫引擎)
**基于 Playwright 的浏览器自动化引擎**
```java
@Service
public class DynamicPageCrawler {

    @Autowired
    private BrowserCrawlerEngine browserEngine;

    public String crawlDynamicPage(String url) {
        CrawlerRequest request = CrawlerRequest.renderPage(url)
            .waitSelector(".content-loaded")     // 等待元素出现
            .waitTimeout(10000)                  // 等待超时
            .screenshot(true)                    // 截图
            .build();

        CrawlerResponse response = browserEngine.crawl(request);

        if (response.isSuccess()) {
            // 获取渲染后的 HTML
            String html = response.getContent();

            // 获取截图
            byte[] screenshot = response.getScreenshot();

            return html;
        }
        return null;
    }
}
```

运行模式: - LOCAL:本地启动浏览器实例,适合开发调试 - REMOTE:连接远程 Playwright Server,支持 Docker/K8s 部署

远程模式配置:

```yaml
nebula:
  crawler:
    browser:
      enabled: true
      mode: REMOTE
      remote:
        endpoints:
          - ws://playwright-server-01:9222
          - ws://playwright-server-02:9222
        load-balance-strategy: ROUND_ROBIN
```

#### nebula-crawler-proxy (代理池管理)
**统一的代理IP管理和轮换**
```java
@Service
public class ProxyCrawlerService {

    @Autowired
    private HttpCrawlerEngine engine;

    @Autowired
    private ProxyProvider proxyProvider;

    public String crawlWithProxy(String url) {
        // 获取可用代理
        Proxy proxy = proxyProvider.getProxy();

        CrawlerRequest request = CrawlerRequest.get(url)
            .proxy(proxy)
            .build();

        CrawlerResponse response = engine.crawl(request);

        // 上报代理使用结果
        if (response.isSuccess()) {
            proxyProvider.reportSuccess(proxy);
        } else {
            proxyProvider.reportFailure(proxy);
        }

        return response.getContent();
    }
}
```

代理源配置:

```yaml
nebula:
  crawler:
    proxy:
      enabled: true
      min-available: 10
      check-url: https://www.baidu.com
      # 静态代理
      static-proxies:
        - http://proxy1:8080
        - socks5://proxy2:1080
      # API代理源
      api-sources:
        - name: provider1
          url: http://api.proxy-provider.com/get
          format: json
```

特性: - 多代理源支持(静态配置、API获取) - 自动健康检查和失效剔除 - 智能轮换策略 - Redis 持久化存储

#### nebula-crawler-captcha (验证码处理)
**多类型验证码识别和处理**
```java
@Service
public class CaptchaCrawler {

    @Autowired
    private CaptchaManager captchaManager;

    public String solveCaptcha(byte[] imageData, CaptchaType type) {
        switch (type) {
            case IMAGE:
                // 图片验证码识别
                return captchaManager.solveImage(imageData);

            case SLIDER:
                // 滑块验证码 - 返回滑动距离
                return captchaManager.solveSlider(backgroundImage, sliderImage);

            case CLICK:
                // 点选验证码 - 返回点击坐标
                return captchaManager.solveClick(imageData, targetText);

            default:
                throw new UnsupportedOperationException("不支持的验证码类型");
        }
    }
}
```

支持的验证码类型: - 图片验证码(本地 ddddocr 识别) - 滑块验证码(OpenCV 计算滑动距离) - 点选验证码 - 旋转验证码 - 第三方平台(2Captcha、Anti-Captcha)

配置示例:

```yaml
nebula:
  crawler:
    captcha:
      enabled: true
      local-ocr-enabled: true
      ddddocr-url: http://ddddocr-service:8866
      opencv-url: http://opencv-service:8867
      providers:
        - name: 2captcha
          api-key: ${CAPTCHA_API_KEY}
          enabled: true
          priority: 1
```

#### Maven 依赖

```xml
<!-- 完整爬虫功能 -->
<dependency>
    <groupId>io.nebula</groupId>
    <artifactId>nebula-crawler-core</artifactId>
</dependency>
<dependency>
    <groupId>io.nebula</groupId>
    <artifactId>nebula-crawler-http</artifactId>
</dependency>
<dependency>
    <groupId>io.nebula</groupId>
    <artifactId>nebula-crawler-browser</artifactId>
</dependency>
<dependency>
    <groupId>io.nebula</groupId>
    <artifactId>nebula-crawler-proxy</artifactId>
</dependency>
<dependency>
    <groupId>io.nebula</groupId>
    <artifactId>nebula-crawler-captcha</artifactId>
</dependency>
```

#### 完整使用示例

```java
@Service
@Slf4j
public class SupplierDataCrawler {

    @Autowired
    private HttpCrawlerEngine httpEngine;

    @Autowired
    private BrowserCrawlerEngine browserEngine;

    @Autowired
    private ProxyProvider proxyProvider;

    @Autowired
    private CaptchaManager captchaManager;

    /**
     * 采集供应商数据
     */
    public SupplierData crawlSupplier(String companyName) {
        // 1. 静态页面采集(使用HTTP引擎)
        CrawlerRequest listRequest = CrawlerRequest.get("https://example.com/search")
            .param("keyword", companyName)
            .build();

        CrawlerResponse listResponse = httpEngine.crawl(listRequest);
        String detailUrl = parseDetailUrl(listResponse);

        // 2. 动态页面采集(使用浏览器引擎)
        CrawlerRequest detailRequest = CrawlerRequest.renderPage(detailUrl)
            .waitSelector(".company-info")
            .proxy(proxyProvider.getProxy())
            .build();

        CrawlerResponse detailResponse = browserEngine.crawl(detailRequest);

        // 3. 解析数据
        Document doc = detailResponse.asDocument();
        return parseSupplierData(doc);
    }
}
```